use std::{
collections::{HashMap, HashSet},
fs,
path::{Path, PathBuf},
sync::atomic::{AtomicBool, Ordering},
time::Duration,
};
use chromiumoxide::{
Page as CdpPage,
cdp::{
browser_protocol::{
accessibility::{AxNode, GetFullAxTreeParams, QueryAxTreeParams},
browser::{
PermissionDescriptor, PermissionSetting, SetDownloadBehaviorBehavior,
SetDownloadBehaviorParams, SetPermissionParams,
},
dom::{GetDocumentParams, ResolveNodeParams},
emulation::{
SetDeviceMetricsOverrideParams, SetGeolocationOverrideParams,
SetLocaleOverrideParams, SetTimezoneOverrideParams, SetUserAgentOverrideParams,
UserAgentBrandVersion, UserAgentMetadata,
},
input::{
DispatchKeyEventParams, DispatchKeyEventType, DispatchMouseEventParams,
DispatchMouseEventType, MouseButton,
},
network::{
Cookie, CookieParam, DeleteCookiesParams, EventResponseReceived, Headers,
ResourceType, SetExtraHttpHeadersParams,
},
page::{
AddScriptToEvaluateOnNewDocumentParams, CaptureScreenshotFormat,
EventLifecycleEvent, PrintToPdfParams, SetBypassCspParams, Viewport,
},
},
js_protocol::runtime::CallFunctionOnParams,
},
page::ScreenshotParams,
};
use futures::StreamExt;
use serde_json::Value;
use tokio::time;
use crate::{
ax::compact_outline,
error::{Result, VoidCrawlError},
stealth::StealthConfig,
};
/// Summary of a navigation, as produced by `Page::goto_and_wait_for_idle`.
#[derive(Debug, Clone)]
pub struct PageResponse {
/// Page HTML as returned by `Page::content` once waiting finished.
pub html: String,
/// URL reported by `Page::url` after the navigation (empty when none was reported).
pub url: String,
/// Last non-zero, non-3xx status seen on a `Document` response, if any.
pub status_code: Option<u16>,
/// `true` when at least one 3xx `Document` response was observed.
pub redirected: bool,
}
/// Rectangle used as the clip region of a screenshot.
///
/// `Page::screenshot` converts each field to `f64` and passes them as a CDP
/// `Viewport` with `scale: 1.0`.
#[derive(Debug, Clone, Copy)]
pub struct Bbox {
/// Horizontal offset of the rectangle's origin.
pub x: u32,
/// Vertical offset of the rectangle's origin.
pub y: u32,
/// Rectangle width.
pub width: u32,
/// Rectangle height.
pub height: u32,
}
/// Options accepted by `Page::screenshot`.
///
/// The default value has neither a path nor a bounding box.
#[derive(Debug, Default, Clone)]
pub struct ScreenshotOptions {
    /// When set, the captured PNG is written to this file.
    pub path: Option<PathBuf>,
    /// When set, only this region is captured; otherwise the full page is.
    pub bbox: Option<Bbox>,
}

impl ScreenshotOptions {
    /// Returns these options with the output file set to `path`.
    pub fn with_path(self, path: impl Into<PathBuf>) -> Self {
        Self { path: Some(path.into()), ..self }
    }

    /// Returns these options with the capture region set to `bbox`.
    pub fn with_bbox(self, bbox: Bbox) -> Self {
        Self { bbox: Some(bbox), ..self }
    }
}
/// What `Page::screenshot` hands back, depending on whether a path was supplied.
#[derive(Debug)]
pub enum ScreenshotOutput {
/// The PNG data itself (returned when no path was supplied).
Bytes(Vec<u8>),
/// The file the PNG was written to (returned when a path was supplied).
Path(PathBuf),
}
/// Description of a completed download found on disk.
#[derive(Debug, Clone)]
pub struct DownloadOutcome {
/// Location of the downloaded file.
pub path: PathBuf,
/// File size in bytes, taken from the file's metadata.
pub bytes: u64,
/// MIME type without parameters, when one was observed; `SettleTracker::poll`
/// always leaves this `None` and `Page::run_download` fills it in.
pub content_type: Option<String>,
}
/// Handle returned by `Page::arm_download` for a download the caller triggers itself.
///
/// Holds the watched directory, the entries it contained when the capture was
/// created, and the size limit.
#[derive(Debug)]
pub struct DownloadCapture {
// Directory watched for the new file.
dir: PathBuf,
// Entries present when the capture was created; these are never reported as the download.
before: HashSet<PathBuf>,
// Largest accepted file size in bytes.
max_bytes: u64,
}
impl DownloadCapture {
/// Waits for the download like [`Self::poll`], then resets the page's download
/// behaviour regardless of the outcome and returns the wait's result.
pub async fn wait(self, page: &Page, timeout: Duration) -> Result<DownloadOutcome> {
let result = self.poll(timeout).await;
// Reset on success and on failure so the page never stays armed.
page.reset_download_behavior().await;
result
}
/// Waits up to `timeout` for a single new, settled file to appear in the
/// watched directory, without touching the page's download behaviour.
///
/// # Errors
/// Propagates whatever `wait_for_new_download` returns (timeout, ambiguous
/// or oversize download).
pub async fn poll(&self, timeout: Duration) -> Result<DownloadOutcome> {
wait_for_new_download(&self.dir, &self.before, self.max_bytes, timeout).await
}
}
/// Wrapper around a chromiumoxide page that adds crawling helpers.
#[derive(Debug)]
pub struct Page {
// Underlying CDP page every method delegates to.
inner: CdpPage,
// Set while a custom download behaviour is installed; cleared by `reset_download_behavior`.
download_armed: AtomicBool,
}
impl Page {
/// Wraps `inner`; the page starts with downloads not armed.
pub(crate) fn new(inner: CdpPage) -> Self {
Self { inner, download_armed: AtomicBool::new(false) }
}
/// Reports whether a download behaviour is currently installed on this page
/// (set by `arm_download` / `run_download`, cleared by `reset_download_behavior`).
pub fn is_download_armed(&self) -> bool {
self.download_armed.load(Ordering::Relaxed)
}
/// Applies the stealth configuration `cfg` to this page.
///
/// In order: optionally enables chromiumoxide's built-in stealth mode,
/// optionally overrides the user agent (with matching platform and client
/// hints), sets the viewport metrics, optionally bypasses CSP, and optionally
/// registers a script to run on every new document.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when any CDP command or parameter
/// builder fails.
pub(crate) async fn apply_stealth(&self, cfg: &StealthConfig) -> Result<()> {
if cfg.use_builtin_stealth {
if let Some(ua) = &cfg.user_agent {
self.inner
.enable_stealth_mode_with_agent(ua)
.await
.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
} else {
self.inner
.enable_stealth_mode()
.await
.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
}
}
// Pick the user agent to override with: the configured one wins; with
// built-in stealth and no configured UA nothing is overridden here;
// otherwise the page's own UA is probed and passed through `dehead`.
let override_ua = if let Some(ua) = cfg.user_agent.clone() {
Some(ua)
} else if cfg.use_builtin_stealth {
None
} else {
probe_user_agent(&self.inner).await?.map(|ua| dehead(&ua))
};
if let Some(ua) = override_ua {
// Derive the platform string and client-hint metadata from the UA text.
let (nav_platform, metadata) = client_hints_for_ua(&ua);
let mut builder = SetUserAgentOverrideParams::builder()
.user_agent(ua)
.accept_language(&cfg.locale)
.platform(nav_platform);
if let Some(metadata) = metadata {
builder = builder.user_agent_metadata(metadata);
}
let params = builder.build().map_err(VoidCrawlError::PageError)?;
self.inner
.execute(params)
.await
.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
}
// Viewport size from the config; the trailing arguments are 1.0 and false
// (NOTE(review): presumably device scale factor and the mobile flag — confirm
// against the CDP parameter order).
let metrics = SetDeviceMetricsOverrideParams::new(
i64::from(cfg.viewport_width),
i64::from(cfg.viewport_height),
1.0,
false,
);
self.inner.execute(metrics).await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
if cfg.bypass_csp {
let csp = SetBypassCspParams::new(true);
self.inner.execute(csp).await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
}
if let Some(js) = &cfg.inject_js {
let params = AddScriptToEvaluateOnNewDocumentParams::new(js.clone());
self.inner
.execute(params)
.await
.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
}
Ok(())
}
/// Navigates the page to `url` using the underlying page's `goto`.
///
/// # Errors
/// Returns `VoidCrawlError::NavigationFailed` when `goto` fails.
pub async fn navigate(&self, url: &str) -> Result<()> {
self.inner.goto(url).await.map_err(|e| VoidCrawlError::NavigationFailed(e.to_string()))?;
Ok(())
}
/// Navigates to `url` and waits for a `networkIdle` lifecycle event, giving up
/// after `timeout`.
///
/// While waiting, `Document` responses are inspected: 3xx statuses count as
/// redirects and any other non-zero status becomes the reported status code.
/// When the timeout fires before the page went idle, a best-effort
/// `PageResponse` is still returned (content/URL errors are replaced by empty
/// strings in that case).
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when an event listener cannot be
/// created or, on the non-timeout path, when reading content/URL fails, and
/// `VoidCrawlError::NavigationFailed` when `goto` fails.
pub async fn goto_and_wait_for_idle(
&self,
url: &str,
timeout: Duration,
) -> Result<PageResponse> {
// Both listeners are registered before `goto` is issued.
let mut lifecycle = self
.inner
.event_listener::<EventLifecycleEvent>()
.await
.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
let mut network = self
.inner
.event_listener::<EventResponseReceived>()
.await
.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
self.inner.goto(url).await.map_err(|e| VoidCrawlError::NavigationFailed(e.to_string()))?;
// The timeout clock starts only after `goto` has returned.
let deadline = time::sleep(timeout);
tokio::pin!(deadline);
let mut status_code: Option<u16> = None;
let mut redirect_count: u32 = 0;
let mut got_almost_idle = false;
loop {
tokio::select! {
// `biased` polls the branches in the order written: lifecycle, network, deadline.
biased;
maybe_lifecycle = lifecycle.next() => {
match maybe_lifecycle {
Some(event) => match event.name.as_str() {
"networkIdle" => break,
"networkAlmostIdle" => { got_almost_idle = true; }
_ => {}
},
// Lifecycle stream ended: stop waiting and read the page as-is.
None => break,
}
}
maybe_network = network.next() => {
if let Some(event) = maybe_network {
if event.r#type == ResourceType::Document {
#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
let code = event.response.status as u16;
if (300..400).contains(&code) {
redirect_count += 1;
} else if code != 0 {
status_code = Some(code);
}
// A new document response invalidates an earlier "almost idle" sighting.
got_almost_idle = false;
}
}
}
() = &mut deadline => {
// Timed out: "almost idle" is accepted as good enough and takes the
// normal exit below; otherwise return whatever can be read, ignoring errors.
if got_almost_idle {
break;
}
let html = self.content().await.unwrap_or_default();
let final_url = self.url().await.unwrap_or_default().unwrap_or_default();
return Ok(PageResponse {
html,
url: final_url,
status_code,
redirected: redirect_count > 0,
});
}
}
}
let html = self.content().await?;
let final_url = self.url().await?.unwrap_or_default();
Ok(PageResponse { html, url: final_url, status_code, redirected: redirect_count > 0 })
}
/// Delegates to the underlying page's `wait_for_navigation`.
///
/// # Errors
/// Returns `VoidCrawlError::NavigationFailed` when the underlying call fails.
pub async fn wait_for_navigation(&self) -> Result<()> {
self.inner
.wait_for_navigation()
.await
.map_err(|e| VoidCrawlError::NavigationFailed(e.to_string()))?;
Ok(())
}
/// Waits up to `timeout` for the page's network activity to settle.
///
/// Returns `Some("networkIdle")` as soon as that lifecycle event arrives.
/// When the timeout fires (or the event stream ends) first, returns
/// `Some("networkAlmostIdle")` if that event was seen, otherwise `None`.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the lifecycle listener cannot be
/// created.
pub async fn wait_for_network_idle(&self, timeout: Duration) -> Result<Option<String>> {
    let mut lifecycle = self
        .inner
        .event_listener::<EventLifecycleEvent>()
        .await
        .map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
    let timer = time::sleep(timeout);
    tokio::pin!(timer);

    let mut saw_almost_idle = false;
    // The loop yields `true` only when a full `networkIdle` was observed.
    let fully_idle = loop {
        tokio::select! {
            // Drain pending lifecycle events before looking at the timer.
            biased;
            next = lifecycle.next() => {
                let Some(event) = next else { break false };
                match event.name.as_str() {
                    "networkIdle" => break true,
                    "networkAlmostIdle" => saw_almost_idle = true,
                    _ => {}
                }
            }
            () = &mut timer => break false,
        }
    };

    let reached = if fully_idle {
        Some("networkIdle")
    } else if saw_almost_idle {
        Some("networkAlmostIdle")
    } else {
        None
    };
    Ok(reached.map(String::from))
}
/// Waits until an element matching the CSS `selector` exists in the document.
///
/// A JavaScript promise is evaluated in the page: it resolves immediately when
/// the selector already matches, otherwise a `MutationObserver` re-checks on
/// every DOM change until an in-page timer rejects it.
///
/// The in-page timer is capped at `2_147_483_647` ms: browsers keep timer
/// delays in a signed 32-bit integer, so larger values overflow and fire
/// immediately. Without the cap, a very large `timeout` (for example
/// `Duration::MAX`) would time out at once instead of waiting.
///
/// # Errors
/// * `VoidCrawlError::Other` when the selector cannot be encoded.
/// * `VoidCrawlError::Timeout` when the in-page timer fired first.
/// * `VoidCrawlError::JsEvalError` for any other evaluation failure.
pub async fn wait_for_selector(&self, selector: &str, timeout: Duration) -> Result<()> {
    /// Largest delay `setTimeout` honours (2^31 - 1 milliseconds).
    const MAX_TIMER_MS: u64 = 2_147_483_647;

    // JSON string syntax is valid JavaScript string syntax, so this is a safe
    // way to embed arbitrary selector text in the script.
    let sel_lit = serde_json::to_string(selector)
        .map_err(|e| VoidCrawlError::Other(format!("selector encode: {e}")))?;
    let timeout_ms =
        u64::try_from(timeout.as_millis()).unwrap_or(u64::MAX).min(MAX_TIMER_MS);
    let js = format!(
        "() => new Promise((resolve, reject) => {{\
        const sel = {sel_lit};\
        if (document.querySelector(sel)) return resolve(true);\
        const root = document.documentElement || document.body;\
        const obs = new MutationObserver(() => {{\
        if (document.querySelector(sel)) {{\
        obs.disconnect();\
        clearTimeout(t);\
        resolve(true);\
        }}\
        }});\
        obs.observe(root, {{ childList: true, subtree: true }});\
        const t = setTimeout(() => {{\
        obs.disconnect();\
        reject(new Error('wait_for_selector timeout: ' + sel));\
        }}, {timeout_ms});\
        }})"
    );
    match self.inner.evaluate_function(js).await {
        Ok(_) => Ok(()),
        Err(e) => {
            let msg = e.to_string();
            // The rejection text written by the script above is the only way
            // to tell a timeout apart from other evaluation failures.
            if msg.contains("wait_for_selector timeout") {
                Err(VoidCrawlError::Timeout(format!(
                    "selector {selector:?} did not appear within {timeout_ms}ms"
                )))
            } else {
                Err(VoidCrawlError::JsEvalError(msg))
            }
        }
    }
}
/// Returns the page content as reported by the underlying page's `content()`.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the underlying call fails.
pub async fn content(&self) -> Result<String> {
self.inner.content().await.map_err(|e| VoidCrawlError::PageError(e.to_string()))
}
/// Returns the page title, if the underlying page reports one.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the underlying call fails.
pub async fn title(&self) -> Result<Option<String>> {
self.inner.get_title().await.map_err(|e| VoidCrawlError::PageError(e.to_string()))
}
/// Returns the page's current URL, if the underlying page reports one.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the underlying call fails.
pub async fn url(&self) -> Result<Option<String>> {
self.inner.url().await.map_err(|e| VoidCrawlError::PageError(e.to_string()))
}
/// Evaluates `expression` in the page and returns its value as JSON.
///
/// When the evaluation result carries no value, `Value::Null` is returned.
///
/// # Errors
/// Returns `VoidCrawlError::JsEvalError` when the evaluation fails.
pub async fn evaluate_js(&self, expression: &str) -> Result<Value> {
    let evaluated = self
        .inner
        .evaluate(expression)
        .await
        .map_err(|e| VoidCrawlError::JsEvalError(e.to_string()))?;
    Ok(evaluated.value().cloned().unwrap_or(Value::Null))
}
/// Captures a screenshot with default options and returns the PNG bytes.
///
/// # Errors
/// Propagates any error from [`Self::screenshot`].
pub async fn screenshot_png(&self) -> Result<Vec<u8>> {
match self.screenshot(ScreenshotOptions::default()).await? {
ScreenshotOutput::Bytes(b) => Ok(b),
// `screenshot` only returns `Path` when a path was supplied; the default options have none.
ScreenshotOutput::Path(_) => unreachable!("no path supplied"),
}
}
/// Captures a PNG screenshot according to `opts`.
///
/// With `opts.bbox` set, only that region is captured (scale 1.0); otherwise
/// the full page is captured. With `opts.path` set, the PNG is written to
/// that file and `ScreenshotOutput::Path` is returned; otherwise the bytes
/// are returned as `ScreenshotOutput::Bytes`.
///
/// # Errors
/// Returns `VoidCrawlError::ScreenshotError` when the capture or the file
/// write fails.
pub async fn screenshot(&self, opts: ScreenshotOptions) -> Result<ScreenshotOutput> {
let mut builder = ScreenshotParams::builder().format(CaptureScreenshotFormat::Png);
if let Some(bbox) = opts.bbox {
builder = builder.clip(Viewport {
x: f64::from(bbox.x),
y: f64::from(bbox.y),
width: f64::from(bbox.width),
height: f64::from(bbox.height),
scale: 1.0,
});
} else {
builder = builder.full_page(true);
}
let bytes = self
.inner
.screenshot(builder.build())
.await
.map_err(|e| VoidCrawlError::ScreenshotError(e.to_string()))?;
if let Some(path) = opts.path {
// NOTE(review): `fs::write` is a blocking call inside an async fn.
fs::write(&path, &bytes).map_err(|e| {
VoidCrawlError::ScreenshotError(format!("write {}: {e}", path.display()))
})?;
Ok(ScreenshotOutput::Path(path))
} else {
Ok(ScreenshotOutput::Bytes(bytes))
}
}
/// Renders the page to PDF with default `PrintToPdfParams` and returns the bytes.
///
/// # Errors
/// Returns `VoidCrawlError::PdfError` when rendering fails.
pub async fn pdf_bytes(&self) -> Result<Vec<u8>> {
let params = PrintToPdfParams::default();
self.inner.pdf(params).await.map_err(|e| VoidCrawlError::PdfError(e.to_string()))
}
/// Downloads `url` into `dir`, waiting at most `timeout` and rejecting files
/// larger than `max_bytes`.
///
/// The page's download behaviour is reset afterwards whether or not the
/// download succeeded.
///
/// # Errors
/// Propagates any error from `run_download`.
pub async fn download_to_dir(
&self,
url: &str,
dir: &Path,
timeout: Duration,
max_bytes: u64,
) -> Result<DownloadOutcome> {
let outcome = self.run_download(url, dir, timeout, max_bytes).await;
// Always disarm, even when the download failed.
self.reset_download_behavior().await;
outcome
}
/// Configures the browser to save downloads into `dir` (behaviour
/// `AllowAndName`) and returns a `DownloadCapture` for the next new file.
///
/// The capture records the directory's entries at this point, so only files
/// appearing later are considered. The caller is expected to trigger the
/// download itself and then use the capture's `wait`/`poll`.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the parameters cannot be built or
/// the CDP command fails.
pub async fn arm_download(&self, dir: &Path, max_bytes: u64) -> Result<DownloadCapture> {
let params = SetDownloadBehaviorParams::builder()
.behavior(SetDownloadBehaviorBehavior::AllowAndName)
.download_path(dir.to_string_lossy().into_owned())
.build()
.map_err(VoidCrawlError::PageError)?;
self.inner.execute(params).await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
self.download_armed.store(true, Ordering::Relaxed);
Ok(DownloadCapture { dir: dir.to_path_buf(), before: dir_entries(dir), max_bytes })
}
/// Restores the browser's default download behaviour and clears the armed flag.
///
/// Best effort: failures to build or execute the CDP command are ignored, and
/// the flag is cleared regardless.
pub async fn reset_download_behavior(&self) {
if let Ok(params) = SetDownloadBehaviorParams::builder()
.behavior(SetDownloadBehaviorBehavior::Default)
.build()
{
let _ = self.inner.execute(params).await;
}
self.download_armed.store(false, Ordering::Relaxed);
}
/// Fetches `url` from inside the page and saves it into `dir` through the
/// browser's download machinery.
///
/// Steps:
/// 1. Snapshot `dir` and set the download behaviour to `AllowAndName`.
/// 2. Navigate to the URL's origin when one can be derived (navigation errors
///    are ignored; presumably this lets the in-page `fetch` run same-origin —
///    confirm).
/// 3. Inject `DOWNLOAD_JS`, which fetches the resource and clicks a synthetic
///    download link, reporting progress through `window.__vcDl`.
/// 4. Poll that state every 200 ms; once the script reports `done`, wait for
///    the new file in `dir` to settle via `SettleTracker`.
///
/// This method does not reset the download behaviour; `download_to_dir` does.
///
/// # Errors
/// * `VoidCrawlError::PageError` when the download behaviour cannot be set.
/// * `VoidCrawlError::JsEvalError` when injecting the script fails.
/// * `VoidCrawlError::Other` when the script reports an error, or when the
///   settled file is ambiguous or over `max_bytes`.
/// * `VoidCrawlError::Timeout` when nothing completed within `timeout`.
async fn run_download(
    &self,
    url: &str,
    dir: &Path,
    timeout: Duration,
    max_bytes: u64,
) -> Result<DownloadOutcome> {
    const POLL: Duration = Duration::from_millis(200);

    let before = dir_entries(dir);
    let params = SetDownloadBehaviorParams::builder()
        .behavior(SetDownloadBehaviorBehavior::AllowAndName)
        .download_path(dir.to_string_lossy().into_owned())
        .build()
        .map_err(VoidCrawlError::PageError)?;
    self.inner.execute(params).await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
    self.download_armed.store(true, Ordering::Relaxed);

    if let Some(origin) = origin_of(url) {
        // Best effort: a failed navigation still lets the fetch be attempted.
        let _ = self.inner.goto(&origin).await;
    }

    let url_json = serde_json::to_string(url).unwrap_or_else(|_| "''".to_string());
    // Substitute `__MAX__` before `__URL__`: `str::replace` does not rescan
    // text it has inserted, so the caller-supplied URL is never subject to
    // placeholder replacement. In the opposite order a URL containing the
    // text `__MAX__` would be silently rewritten.
    let js =
        DOWNLOAD_JS.replace("__MAX__", &max_bytes.to_string()).replace("__URL__", &url_json);
    self.evaluate_js(&js).await?;

    let deadline = time::Instant::now() + timeout;
    let mut settle = SettleTracker::new();
    let mut content_type: Option<String> = None;
    let mut done = false;
    loop {
        // A failed state read is ignored; the next poll tries again.
        if let Ok(state) = self.evaluate_js("window.__vcDl || null").await {
            if let Some(ct) = state.get("ct").and_then(|v| v.as_str()) {
                content_type = Some(strip_mime_params(ct));
            }
            if let Some(err) = state.get("err").and_then(|v| v.as_str()) {
                return Err(VoidCrawlError::Other(format!("download failed: {err}")));
            }
            if state.get("done").and_then(Value::as_bool) == Some(true) {
                done = true;
            }
        }
        // Only look at the directory once the script has clicked the link.
        if done {
            if let Some(outcome) = settle.poll(dir, &before, max_bytes)? {
                return Ok(DownloadOutcome { content_type, ..outcome });
            }
        }
        if time::Instant::now() >= deadline {
            return Err(VoidCrawlError::Timeout(format!(
                "download did not complete within {}s",
                timeout.as_secs()
            )));
        }
        time::sleep(POLL).await;
    }
}
/// Fetches the page's full accessibility tree and returns its nodes as JSON.
///
/// `depth` is forwarded unchanged as the CDP `depth` parameter; no frame id is set.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the CDP command or the JSON
/// conversion fails.
pub async fn get_full_ax_tree(&self, depth: Option<i64>) -> Result<Value> {
let params = GetFullAxTreeParams { depth, frame_id: None };
let resp = self
.inner
.execute(params)
.await
.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
serde_json::to_value(&resp.result.nodes)
.map_err(|e| VoidCrawlError::PageError(e.to_string()))
}
/// Fetches the accessibility tree and renders it with `compact_outline`.
///
/// # Errors
/// Propagates any error from [`Self::get_full_ax_tree`].
pub async fn ax_tree_outline(&self, depth: Option<i64>) -> Result<String> {
let tree = self.get_full_ax_tree(depth).await?;
// A non-array value is treated as an empty node list.
let nodes = tree.as_array().map_or(&[][..], Vec::as_slice);
Ok(compact_outline(nodes))
}
/// Queries accessibility nodes by optional `role` and accessible `name` and
/// returns the matches as JSON.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the query or the JSON conversion fails.
pub async fn query_ax_tree(&self, role: Option<&str>, name: Option<&str>) -> Result<Value> {
let nodes = self.query_ax_nodes(role, name).await?;
serde_json::to_value(&nodes).map_err(|e| VoidCrawlError::PageError(e.to_string()))
}
/// Runs a CDP accessibility query rooted at the document node, filtered by the
/// optional `role` and accessible `name`.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when fetching the document or running
/// the query fails.
async fn query_ax_nodes(&self, role: Option<&str>, name: Option<&str>) -> Result<Vec<AxNode>> {
// The document's root node id anchors the query.
let doc = self
.inner
.execute(GetDocumentParams::default())
.await
.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
let params = QueryAxTreeParams {
node_id: Some(doc.result.root.node_id),
accessible_name: name.map(str::to_string),
role: role.map(str::to_string),
..Default::default()
};
let resp = self
.inner
.execute(params)
.await
.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
Ok(resp.result.nodes)
}
/// Clicks the `nth` (zero-based) element whose accessibility role and name
/// match `role` and `name`.
///
/// Ignored accessibility nodes and nodes without a backend DOM node id are
/// skipped before indexing. The chosen node is resolved to a JavaScript
/// object, scrolled into view, and clicked via `this.click()`.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the query fails, when fewer than
/// `nth + 1` candidates exist, when the node cannot be resolved, or when the
/// click call fails.
pub async fn click_by_role(&self, role: &str, name: &str, nth: usize) -> Result<()> {
let nodes = self.query_ax_nodes(Some(role), Some(name)).await?;
let backends: Vec<_> =
nodes.iter().filter(|n| !n.ignored).filter_map(|n| n.backend_dom_node_id).collect();
let backend_id = backends.get(nth).copied().ok_or_else(|| {
VoidCrawlError::PageError(format!(
"no AX node with role={role:?} name={name:?} at index {nth} (found {} match(es))",
backends.len()
))
})?;
let resolved = self
.inner
.execute(ResolveNodeParams { backend_node_id: Some(backend_id), ..Default::default() })
.await
.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
let object_id = resolved.result.object.object_id.ok_or_else(|| {
VoidCrawlError::PageError("AX node could not be resolved to a DOM handle".into())
})?;
// The function runs with the resolved element as `this`.
let call = CallFunctionOnParams::builder()
.object_id(object_id)
.function_declaration(
"function(){ this.scrollIntoView({block:'center',inline:'center'}); this.click(); }",
)
.await_promise(false)
.build()
.map_err(VoidCrawlError::PageError)?;
self.inner.execute(call).await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
Ok(())
}
/// Grants the `geolocation` permission and overrides the reported position.
///
/// The permission is granted without an origin, embedded origin or browser
/// context id. `accuracy` defaults to `50.0` when `None`.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when either CDP command fails.
pub async fn set_geolocation(
&self,
latitude: f64,
longitude: f64,
accuracy: Option<f64>,
) -> Result<()> {
let grant = SetPermissionParams {
permission: PermissionDescriptor::new("geolocation"),
setting: PermissionSetting::Granted,
origin: None,
embedded_origin: None,
browser_context_id: None,
};
self.inner.execute(grant).await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
let params = SetGeolocationOverrideParams {
latitude: Some(latitude),
longitude: Some(longitude),
accuracy: Some(accuracy.unwrap_or(50.0)),
..Default::default()
};
self.inner.execute(params).await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
Ok(())
}
/// Overrides the page's locale via the CDP locale-override command.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the CDP command fails.
pub async fn set_locale(&self, locale: &str) -> Result<()> {
let params = SetLocaleOverrideParams { locale: Some(locale.to_string()) };
self.inner.execute(params).await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
Ok(())
}
/// Overrides the page's timezone with `timezone_id` via the CDP
/// timezone-override command.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the CDP command fails.
pub async fn set_timezone(&self, timezone_id: &str) -> Result<()> {
let params = SetTimezoneOverrideParams::new(timezone_id.to_string());
self.inner.execute(params).await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
Ok(())
}
pub async fn query_selector(&self, selector: &str) -> Result<Option<String>> {
let js = format!(
"(function(){{ var el = document.querySelector({selector:?}); \
return el === null ? null : el.innerHTML; }})()"
);
let result = self
.inner
.evaluate_expression(js)
.await
.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
let val: Value = match result.into_value() {
Ok(v) => v,
Err(_) => return Ok(None),
};
match val {
Value::Null => Ok(None),
Value::String(s) => Ok(Some(s)),
other => Ok(Some(other.to_string())),
}
}
pub async fn query_selector_all(&self, selector: &str) -> Result<Vec<String>> {
let js = format!("[...document.querySelectorAll({selector:?})].map(e => e.innerHTML)");
let val: Value = self
.inner
.evaluate_expression(js)
.await
.map_err(|e| VoidCrawlError::PageError(e.to_string()))?
.into_value()
.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
match val {
Value::Array(arr) => Ok(arr
.into_iter()
.map(|v| match v {
Value::String(s) => s,
other => other.to_string(),
})
.collect()),
_ => Ok(Vec::new()),
}
}
/// Finds the first element matching `selector` and clicks it.
///
/// # Errors
/// Returns `VoidCrawlError::ElementNotFound` when the lookup fails and
/// `VoidCrawlError::PageError` when the click fails.
pub async fn click_element(&self, selector: &str) -> Result<()> {
let el = self
.inner
.find_element(selector)
.await
.map_err(|e| VoidCrawlError::ElementNotFound(e.to_string()))?;
el.click().await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
Ok(())
}
/// Finds the first element matching `selector`, focuses it, and types `text` into it.
///
/// # Errors
/// Returns `VoidCrawlError::ElementNotFound` when the lookup fails and
/// `VoidCrawlError::PageError` when focusing or typing fails.
pub async fn type_into(&self, selector: &str, text: &str) -> Result<()> {
let el = self
.inner
.find_element(selector)
.await
.map_err(|e| VoidCrawlError::ElementNotFound(e.to_string()))?;
el.focus().await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
el.type_str(text).await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
Ok(())
}
/// Sets extra HTTP headers for the page from a name → value map.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the map cannot be converted to
/// JSON or the CDP command fails.
pub async fn set_headers(&self, headers: HashMap<String, String>) -> Result<()> {
let json_val =
serde_json::to_value(&headers).map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
let params = SetExtraHttpHeadersParams::new(Headers::new(json_val));
self.inner.execute(params).await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
Ok(())
}
/// Returns the cookies reported by the underlying page.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the underlying call fails.
pub async fn get_cookies(&self) -> Result<Vec<Cookie>> {
self.inner.get_cookies().await.map_err(|e| VoidCrawlError::PageError(e.to_string()))
}
/// Sets a single cookie on the underlying page.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the underlying call fails.
pub async fn set_cookie(&self, cookie: CookieParam) -> Result<()> {
self.inner
.set_cookie(cookie)
.await
.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
Ok(())
}
/// Sets several cookies on the underlying page in one call.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the underlying call fails.
pub async fn set_cookies(&self, cookies: Vec<CookieParam>) -> Result<()> {
self.inner
.set_cookies(cookies)
.await
.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
Ok(())
}
/// Deletes the cookies described by `cookies` on the underlying page.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the underlying call fails.
pub async fn delete_cookies(&self, cookies: Vec<DeleteCookiesParams>) -> Result<()> {
self.inner
.delete_cookies(cookies)
.await
.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
Ok(())
}
/// Dispatches a raw CDP mouse event of `event_type` at (`x`, `y`).
///
/// Every optional argument is only set on the CDP parameters when it is
/// `Some`; otherwise the builder's own default applies.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the parameters cannot be built or
/// the CDP command fails.
// The argument list mirrors the CDP parameters one-to-one, hence the lint exemption.
#[allow(clippy::too_many_arguments)]
pub async fn dispatch_mouse_event(
&self,
event_type: DispatchMouseEventType,
x: f64,
y: f64,
button: Option<MouseButton>,
click_count: Option<i64>,
delta_x: Option<f64>,
delta_y: Option<f64>,
modifiers: Option<i64>,
) -> Result<()> {
let mut builder = DispatchMouseEventParams::builder().r#type(event_type).x(x).y(y);
if let Some(b) = button {
builder = builder.button(b);
}
if let Some(c) = click_count {
builder = builder.click_count(c);
}
if let Some(dx) = delta_x {
builder = builder.delta_x(dx);
}
if let Some(dy) = delta_y {
builder = builder.delta_y(dy);
}
if let Some(m) = modifiers {
builder = builder.modifiers(m);
}
let params = builder.build().map_err(VoidCrawlError::PageError)?;
self.inner.execute(params).await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
Ok(())
}
/// Dispatches a raw CDP key event of `event_type`.
///
/// `key`, `code`, `text` and `modifiers` are only set on the CDP parameters
/// when they are `Some`.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the parameters cannot be built or
/// the CDP command fails.
pub async fn dispatch_key_event(
&self,
event_type: DispatchKeyEventType,
key: Option<&str>,
code: Option<&str>,
text: Option<&str>,
modifiers: Option<i64>,
) -> Result<()> {
let mut builder = DispatchKeyEventParams::builder().r#type(event_type);
if let Some(k) = key {
builder = builder.key(k);
}
if let Some(c) = code {
builder = builder.code(c);
}
if let Some(t) = text {
builder = builder.text(t);
}
if let Some(m) = modifiers {
builder = builder.modifiers(m);
}
let params = builder.build().map_err(VoidCrawlError::PageError)?;
self.inner.execute(params).await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
Ok(())
}
/// Closes the underlying page, consuming this wrapper.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the underlying call fails.
pub async fn close(self) -> Result<()> {
self.inner.close().await.map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
Ok(())
}
/// Borrows the wrapped chromiumoxide page for operations this type does not expose.
pub fn inner(&self) -> &CdpPage {
&self.inner
}
}
// Script template injected by `Page::run_download`.
//
// Placeholders: `__URL__` is replaced with the JSON-encoded URL and `__MAX__`
// with the byte limit. The script publishes its state on `window.__vcDl`
// (`ct` = response content type, `err` = failure message, `done` = link clicked).
// It fetches the URL with credentials, aborts when the declared or streamed
// size exceeds the limit, then wraps the body in a Blob and clicks a temporary
// `<a download>` element whose name is the last path segment of the URL
// (falling back to 'download').
const DOWNLOAD_JS: &str = r"(() => {
window.__vcDl = { ct: null, err: null, done: false };
(async () => {
try {
const MAX = __MAX__;
const ctrl = new AbortController();
const resp = await fetch(__URL__, { credentials: 'include', signal: ctrl.signal });
window.__vcDl.ct = resp.headers.get('content-type');
const cl = resp.headers.get('content-length');
if (cl && Number(cl) > MAX) { ctrl.abort(); throw new Error('content-length ' + cl + ' exceeds limit ' + MAX); }
let blob;
if (resp.body && resp.body.getReader) {
const reader = resp.body.getReader();
const chunks = []; let total = 0;
for (;;) {
const { done, value } = await reader.read();
if (done) break;
total += value.byteLength;
if (total > MAX) { ctrl.abort(); throw new Error('exceeded size limit ' + MAX + ' bytes'); }
chunks.push(value);
}
blob = new Blob(chunks);
} else {
blob = await resp.blob();
if (blob.size > MAX) throw new Error('exceeded size limit ' + MAX + ' bytes');
}
const a = document.createElement('a');
a.href = URL.createObjectURL(blob);
a.download = (__URL__.split(/[?#]/)[0].split('/').pop()) || 'download';
(document.body || document.documentElement).appendChild(a);
a.click();
window.__vcDl.done = true;
} catch (e) {
window.__vcDl.err = String((e && e.message) || e);
}
})();
return true;
})()";
/// Reduces a `Content-Type` value to its bare media type: everything from the
/// first `;` on is dropped, surrounding whitespace is trimmed, and ASCII
/// letters are lowercased.
fn strip_mime_params(mime: &str) -> String {
    let essence = match mime.split_once(';') {
        Some((head, _params)) => head,
        None => mime,
    };
    essence.trim().to_ascii_lowercase()
}
/// Extracts `scheme://authority` from an absolute URL.
///
/// Returns `None` when the text before the first `://` is not a valid URL
/// scheme (an ASCII letter followed by ASCII letters, digits, `+`, `-` or
/// `.`) or when the authority is empty. Validating the scheme keeps inputs
/// such as `example.com/p?next=https://other.test` — where the first `://`
/// sits inside the query string — from yielding a bogus origin.
fn origin_of(url: &str) -> Option<String> {
    let (scheme, rest) = url.split_once("://")?;

    let mut scheme_chars = scheme.chars();
    let starts_with_letter = scheme_chars.next().is_some_and(|c| c.is_ascii_alphabetic());
    let tail_is_valid =
        scheme_chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'));
    if !(starts_with_letter && tail_is_valid) {
        return None;
    }

    // The authority ends at the first path, query or fragment delimiter.
    let authority = rest.split(['/', '?', '#']).next()?;
    if authority.is_empty() {
        return None;
    }
    Some(format!("{scheme}://{authority}"))
}
/// Collects the paths of all entries directly inside `dir`.
///
/// An unreadable directory yields an empty set, and individual entries that
/// fail to read are skipped.
fn dir_entries(dir: &Path) -> HashSet<PathBuf> {
    let mut seen = HashSet::new();
    if let Ok(listing) = fs::read_dir(dir) {
        for entry in listing.flatten() {
            seen.insert(entry.path());
        }
    }
    seen
}
/// Lists files in `dir` that look like finished downloads, paired with their size.
///
/// An entry qualifies when it is not in `before`, does not have the
/// `crdownload` extension, is a regular file, and is non-empty. An unreadable
/// directory yields an empty list.
fn new_complete_files(dir: &Path, before: &HashSet<PathBuf>) -> Vec<(PathBuf, u64)> {
    let mut found = Vec::new();
    let Ok(listing) = fs::read_dir(dir) else { return found };
    for entry in listing.flatten() {
        let path = entry.path();
        let in_progress = path.extension().is_some_and(|ext| ext == "crdownload");
        if in_progress || before.contains(&path) {
            continue;
        }
        if let Ok(meta) = entry.metadata() {
            let len = meta.len();
            if meta.is_file() && len > 0 {
                found.push((path, len));
            }
        }
    }
    found
}
// Number of consecutive polls that must see the same file at the same size
// before `SettleTracker::poll` reports the download as complete.
const SETTLE_SIGHTINGS: u32 = 3;
/// Tracks whether a newly appeared file has stopped changing between polls.
struct SettleTracker {
    /// Path and size seen on the most recent poll that found exactly one candidate.
    prev: Option<(PathBuf, u64)>,
    /// How many consecutive polls have seen `prev` unchanged.
    stable: u32,
}

impl SettleTracker {
    /// Creates a tracker that has not seen any candidate yet.
    fn new() -> Self {
        Self { prev: None, stable: 0 }
    }

    /// Inspects `dir` once and reports the download when it has settled.
    ///
    /// Returns `Ok(None)` while there is no candidate or the candidate has not
    /// yet been seen unchanged `SETTLE_SIGHTINGS` times in a row.
    ///
    /// # Errors
    /// Returns `VoidCrawlError::Other` when more than one new file is present,
    /// or when the settled file is larger than `max_bytes` (the file is then
    /// removed, ignoring removal errors).
    fn poll(
        &mut self,
        dir: &Path,
        before: &HashSet<PathBuf>,
        max_bytes: u64,
    ) -> Result<Option<DownloadOutcome>> {
        let mut candidates = new_complete_files(dir, before);
        let count = candidates.len();
        if count > 1 {
            let listed: Vec<String> = candidates
                .iter()
                .filter_map(|(p, _)| p.file_name())
                .map(|n| n.to_string_lossy().into_owned())
                .collect();
            let names = listed.join(", ");
            return Err(VoidCrawlError::Other(format!(
                "ambiguous download: {} new files appeared ({names}); expected exactly one",
                count
            )));
        }

        // At most one candidate remains at this point.
        let Some((path, size)) = candidates.pop() else {
            self.prev = None;
            self.stable = 0;
            return Ok(None);
        };

        let unchanged = matches!(&self.prev, Some((p, s)) if *p == path && *s == size);
        if unchanged {
            self.stable += 1;
        } else {
            // First sighting, or the file changed: restart the count.
            self.prev = Some((path.clone(), size));
            self.stable = 1;
        }
        if self.stable < SETTLE_SIGHTINGS {
            return Ok(None);
        }

        if size > max_bytes {
            let _ = fs::remove_file(&path);
            return Err(VoidCrawlError::Other(format!(
                "download is {size} bytes, over the {max_bytes}-byte limit"
            )));
        }
        Ok(Some(DownloadOutcome { path, bytes: size, content_type: None }))
    }
}
/// Polls `dir` every 250 ms until a single new file has settled or `timeout`
/// has elapsed.
///
/// The directory is always checked at least once, and once more after the
/// deadline has passed, before giving up.
///
/// # Errors
/// Propagates errors from `SettleTracker::poll` and returns
/// `VoidCrawlError::Timeout` when no download completed in time.
async fn wait_for_new_download(
    dir: &Path,
    before: &HashSet<PathBuf>,
    max_bytes: u64,
    timeout: Duration,
) -> Result<DownloadOutcome> {
    const POLL: Duration = Duration::from_millis(250);
    let give_up_at = time::Instant::now() + timeout;
    let mut tracker = SettleTracker::new();
    loop {
        match tracker.poll(dir, before, max_bytes)? {
            Some(found) => return Ok(found),
            None if time::Instant::now() >= give_up_at => {
                return Err(VoidCrawlError::Timeout(format!(
                    "no download completed within {}s",
                    timeout.as_secs()
                )));
            }
            None => time::sleep(POLL).await,
        }
    }
}
/// Reads `navigator.userAgent` from the page.
///
/// Returns `Ok(None)` when the evaluation produced no value or a non-string value.
///
/// # Errors
/// Returns `VoidCrawlError::PageError` when the evaluation fails.
async fn probe_user_agent(page: &CdpPage) -> Result<Option<String>> {
    let evaluated = page
        .evaluate("navigator.userAgent")
        .await
        .map_err(|e| VoidCrawlError::PageError(e.to_string()))?;
    let user_agent = evaluated.value().and_then(Value::as_str).map(str::to_owned);
    Ok(user_agent)
}
/// Removes the headless marker from a user-agent string.
///
/// `HeadlessChrome` becomes `Chrome`; failing that, every `Headless` is
/// deleted. Only the first rule that matches is applied. A string without
/// either marker is returned unchanged.
fn dehead(ua: &str) -> String {
    // Ordered by priority: the longer marker must be tried first.
    const REWRITES: [(&str, &str); 2] = [("HeadlessChrome", "Chrome"), ("Headless", "")];
    let rule = REWRITES.iter().copied().find(|&(marker, _)| ua.contains(marker));
    match rule {
        Some((marker, substitute)) => ua.replace(marker, substitute),
        None => ua.to_string(),
    }
}
/// Derives a platform string and client-hint metadata from a user-agent string.
///
/// Returns `(nav_platform, metadata)`; `apply_stealth` passes the first as the
/// user-agent override's platform and the second as its metadata. The OS is
/// detected by substring (`Windows`, then `Mac OS X`/`Macintosh`, otherwise
/// Linux). Brand lists are only filled in when a `Chrome/<version>` token is
/// present. The metadata is `None` when the builder fails.
fn client_hints_for_ua(ua: &str) -> (String, Option<UserAgentMetadata>) {
// (navigator platform, client-hint platform, client-hint platform version)
let (nav_platform, ch_platform, platform_version) = if ua.contains("Windows") {
("Win32", "Windows", "15.0.0")
} else if ua.contains("Mac OS X") || ua.contains("Macintosh") {
("MacIntel", "macOS", "14.5.0")
} else {
("Linux x86_64", "Linux", "6.8.0")
};
// Full version = text after the first "Chrome/" up to the next whitespace.
let chrome_ver: Option<&str> =
ua.split("Chrome/").nth(1).and_then(|s| s.split_whitespace().next());
// Major version = part before the first '.'.
let major: Option<&str> = chrome_ver.and_then(|v| v.split('.').next());
// Architecture, bitness, mobile and wow64 are fixed values regardless of the UA.
let mut builder = UserAgentMetadata::builder()
.platform(ch_platform)
.platform_version(platform_version)
.architecture("x86")
.model("")
.mobile(false)
.bitness("64")
.wow64(false);
if let (Some(major), Some(full)) = (major, chrome_ver) {
builder = builder
.brands([
UserAgentBrandVersion::new("Chromium", major),
UserAgentBrandVersion::new("Google Chrome", major),
UserAgentBrandVersion::new("Not_A Brand", "24"),
])
.full_version_lists([
UserAgentBrandVersion::new("Chromium", full),
UserAgentBrandVersion::new("Google Chrome", full),
UserAgentBrandVersion::new("Not_A Brand", "24.0.0.0"),
]);
}
(nav_platform.to_string(), builder.build().ok())
}
// Unit tests for the on-disk download detection helpers.
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used, clippy::panic, reason = "test harness")]
mod download_tests {
use std::{fs, path::Path};
use super::{SETTLE_SIGHTINGS, SettleTracker, dir_entries, new_complete_files};
// Writes (or overwrites) `name` in `dir` with `bytes` zero bytes.
fn touch(dir: &Path, name: &str, bytes: usize) {
fs::write(dir.join(name), vec![0u8; bytes]).unwrap();
}
// Pre-existing, `.crdownload` and empty files must all be filtered out.
#[test]
fn new_complete_files_excludes_before_crdownload_and_empty() {
let d = tempfile::tempdir().unwrap();
touch(d.path(), "old.bin", 10);
let before = dir_entries(d.path());
touch(d.path(), "new.bin", 10);
touch(d.path(), "partial.crdownload", 10);
touch(d.path(), "empty.bin", 0);
let files = new_complete_files(d.path(), &before);
assert_eq!(files.len(), 1);
assert_eq!(files[0].0.file_name().unwrap(), "new.bin");
}
// The file is only reported on the SETTLE_SIGHTINGS-th unchanged poll.
#[test]
fn settle_requires_stable_samples_then_accepts() {
let d = tempfile::tempdir().unwrap();
let before = dir_entries(d.path());
touch(d.path(), "f.bin", 100);
let mut s = SettleTracker::new();
for _ in 0..(SETTLE_SIGHTINGS - 1) {
assert!(s.poll(d.path(), &before, 1_000).unwrap().is_none());
}
assert_eq!(s.poll(d.path(), &before, 1_000).unwrap().unwrap().bytes, 100);
}
// A size change between polls restarts the stability count.
#[test]
fn settle_resets_when_size_still_changing() {
let d = tempfile::tempdir().unwrap();
let before = dir_entries(d.path());
touch(d.path(), "f.bin", 10);
let mut s = SettleTracker::new();
s.poll(d.path(), &before, 1_000).unwrap();
touch(d.path(), "f.bin", 20); assert!(s.poll(d.path(), &before, 1_000).unwrap().is_none());
}
// A settled file over the limit is an error and is removed from disk.
#[test]
fn settle_rejects_and_deletes_oversize() {
let d = tempfile::tempdir().unwrap();
let before = dir_entries(d.path());
touch(d.path(), "big.bin", 50);
let mut s = SettleTracker::new();
let mut last = Ok(None);
for _ in 0..SETTLE_SIGHTINGS {
last = s.poll(d.path(), &before, 8);
}
assert!(last.is_err());
assert!(!d.path().join("big.bin").exists(), "oversize file should be deleted");
}
// Two new files at once are ambiguous and rejected on the first poll.
#[test]
fn settle_errors_on_multiple_new_files() {
let d = tempfile::tempdir().unwrap();
let before = dir_entries(d.path());
touch(d.path(), "a.bin", 10);
touch(d.path(), "b.bin", 10);
let mut s = SettleTracker::new();
assert!(s.poll(d.path(), &before, 1_000).is_err());
}
}
// Unit tests for the user-agent helpers (`dehead`, `client_hints_for_ua`).
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used, reason = "test harness")]
mod tests {
use super::{client_hints_for_ua, dehead};
// Sample desktop Chrome user agents, one per platform branch.
const LINUX_UA: &str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36";
const WIN_UA: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36";
const MAC_UA: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36";
// `HeadlessChrome` is rewritten to `Chrome`; a normal UA is left untouched.
#[test]
fn dehead_strips_headless_token() {
assert_eq!(
dehead("Mozilla/5.0 HeadlessChrome/148.0.0.0 Safari"),
"Mozilla/5.0 Chrome/148.0.0.0 Safari"
);
assert_eq!(dehead(LINUX_UA), LINUX_UA);
}
// The platform string and client-hint platform follow the OS named in the UA.
#[test]
fn platform_matches_ua_os() {
assert_eq!(client_hints_for_ua(LINUX_UA).0, "Linux x86_64");
assert_eq!(client_hints_for_ua(WIN_UA).0, "Win32");
assert_eq!(client_hints_for_ua(MAC_UA).0, "MacIntel");
let md = client_hints_for_ua(LINUX_UA).1.unwrap();
assert_eq!(md.platform, "Linux");
assert!(!md.mobile);
assert_eq!(md.architecture, "x86");
}
// Brands use the major version; the full-version list uses the full version.
#[test]
fn brands_carry_chrome_major_version() {
let md = client_hints_for_ua(LINUX_UA).1.unwrap();
let brands = md.brands.unwrap();
assert!(brands.iter().any(|b| b.brand == "Google Chrome" && b.version == "148"));
assert!(brands.iter().any(|b| b.brand == "Chromium" && b.version == "148"));
assert_eq!(brands.len(), 3);
let full = md.full_version_list.unwrap();
assert!(full.iter().any(|b| b.brand == "Google Chrome" && b.version == "148.0.0.0"));
}
// Without a `Chrome/` token no brand list is produced, but metadata still builds.
#[test]
fn non_chrome_ua_has_no_brands() {
let firefox = "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0";
let (nav_platform, md) = client_hints_for_ua(firefox);
assert_eq!(nav_platform, "Linux x86_64");
assert!(md.unwrap().brands.is_none());
}
}