scrapebadger 0.2.0

Async Rust SDK and CLI for the ScrapeBadger web-scraping API (Amazon, Google, Twitter/X, Reddit, Vinted, Web Scraping).
Documentation
// @generated by `cargo run -p xtask -- gen` from specs/web-scraping.json — do not edit by hand.
#![allow(clippy::all)]
#![allow(
    dead_code,
    unused_imports,
    unused_variables,
    non_snake_case,
    rustdoc::all
)]

use crate::core::{Client, Error, Method, QueryParams, Result};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::collections::HashMap;

/// Handle for the `web` platform. Obtain via [`crate::ScrapeBadger::web`].
///
/// Holds a [`Client`] and exposes the `/v1/web/*` endpoints as async methods.
/// The handle derives `Clone`, so it can be copied wherever [`Client`] can.
#[derive(Clone)]
pub struct Web {
    /// Transport client that every request made through this handle is sent with.
    client: Client,
}

impl Web {
    pub(crate) fn new(client: Client) -> Self {
        Self { client }
    }

    /// Access the underlying transport client.
    pub fn client(&self) -> &Client {
        &self.client
    }

    /// Detect Protection
    ///
    /// Analyze a URL for anti-bot and CAPTCHA systems without performing a full scrape.
    /// `POST /v1/web/detect`
    pub async fn detect_protection(
        &self,
        params: DetectProtectionParams,
    ) -> Result<DetectProtectionResponse> {
        let path = "/v1/web/detect".to_string();
        let query: Vec<(String, String)> = Vec::new();
        let mut body = serde_json::Map::new();
        if let Some(v) = &params.country {
            body.insert("country".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.timeout {
            body.insert("timeout".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.url {
            body.insert("url".to_string(), serde_json::json!(v));
        }
        let body = if body.is_empty() {
            None
        } else {
            Some(Value::Object(body))
        };
        self.client.send(Method::POST, &path, &query, body).await
    }

    /// Scrape URL
    ///
    /// Scrape a webpage and return its content as HTML, Markdown, or plain text.
    /// `POST /v1/web/scrape`
    pub async fn scrape_url(&self, params: ScrapeUrlParams) -> Result<ScrapeUrlResponse> {
        let path = "/v1/web/scrape".to_string();
        let query: Vec<(String, String)> = Vec::new();
        let mut body = serde_json::Map::new();
        if let Some(v) = &params.ai_extract {
            body.insert("ai_extract".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.ai_prompt {
            body.insert("ai_prompt".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.anti_bot {
            body.insert("anti_bot".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.country {
            body.insert("country".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.custom_headers {
            body.insert("custom_headers".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.engine {
            body.insert("engine".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.escalate {
            body.insert("escalate".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.format {
            body.insert("format".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.js_scenario {
            body.insert("js_scenario".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.max_cost {
            body.insert("max_cost".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.render_js {
            body.insert("render_js".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.retry_count {
            body.insert("retry_count".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.retry_on_block {
            body.insert("retry_on_block".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.screenshot {
            body.insert("screenshot".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.session_id {
            body.insert("session_id".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.url {
            body.insert("url".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.video {
            body.insert("video".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.wait_after_load {
            body.insert("wait_after_load".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.wait_for {
            body.insert("wait_for".to_string(), serde_json::json!(v));
        }
        if let Some(v) = &params.wait_timeout {
            body.insert("wait_timeout".to_string(), serde_json::json!(v));
        }
        let body = if body.is_empty() {
            None
        } else {
            Some(Value::Object(body))
        };
        self.client.send(Method::POST, &path, &query, body).await
    }
}

// ===== Models =====

/// Response model returned by [`Web::detect_protection`] (`POST /v1/web/detect`).
///
/// The container-level `#[serde(default)]` means any field missing from the payload
/// takes its `Default` value (`None`, empty `Vec`, empty map) instead of failing.
/// The `crate::core::flex::*` deserializers are defined outside this file;
/// NOTE(review): presumably they tolerate loosely-typed input — confirm there.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(default)]
pub struct DetectProtectionResponse {
    /// Contents of the `antibot_systems` array, kept as raw JSON values.
    pub antibot_systems: Vec<Value>,
    /// Value of the `blocking_type` field, if present.
    #[serde(default, deserialize_with = "crate::core::flex::opt_string")]
    pub blocking_type: Option<String>,
    /// Contents of the `captcha_systems` array, kept as raw JSON values.
    pub captcha_systems: Vec<Value>,
    /// Value of the `credits_used` field, if present.
    #[serde(default, deserialize_with = "crate::core::flex::opt_i64")]
    pub credits_used: Option<i64>,
    /// Value of the `duration_ms` field, if present (presumably milliseconds, per the name).
    #[serde(default, deserialize_with = "crate::core::flex::opt_i64")]
    pub duration_ms: Option<i64>,
    /// Value of the `is_blocked` field, if present.
    #[serde(default, deserialize_with = "crate::core::flex::opt_bool")]
    pub is_blocked: Option<bool>,
    /// Value of the `recommendation` field, if present.
    #[serde(default, deserialize_with = "crate::core::flex::opt_string")]
    pub recommendation: Option<String>,
    /// Value of the `url` field, if present.
    #[serde(default, deserialize_with = "crate::core::flex::opt_string")]
    pub url: Option<String>,
    /// Fields present in the response but not in the spec.
    #[serde(flatten)]
    pub extra: HashMap<String, Value>,
}

/// Scraping engine tier to use.
///
/// Each variant is (de)serialized as the lowercase string given in its `serde(rename)`.
/// The enum is `#[non_exhaustive]`, so code outside this crate must include a wildcard
/// arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum ScrapeUrlEngine {
    /// Serialized as `auto`.
    #[serde(rename = "auto")]
    Auto,
    /// Serialized as `browser`.
    #[serde(rename = "browser")]
    Browser,
}

impl std::fmt::Display for ScrapeUrlEngine {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self {
            ScrapeUrlEngine::Auto => "auto",
            ScrapeUrlEngine::Browser => "browser",
        })
    }
}

/// Output format for the scraped content.
///
/// Each variant is (de)serialized as the lowercase string given in its `serde(rename)`.
/// The enum is `#[non_exhaustive]`, so code outside this crate must include a wildcard
/// arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum ScrapeUrlFormat {
    /// Serialized as `html`.
    #[serde(rename = "html")]
    Html,
    /// Serialized as `markdown`.
    #[serde(rename = "markdown")]
    Markdown,
    /// Serialized as `text`.
    #[serde(rename = "text")]
    Text,
}

impl std::fmt::Display for ScrapeUrlFormat {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self {
            ScrapeUrlFormat::Html => "html",
            ScrapeUrlFormat::Markdown => "markdown",
            ScrapeUrlFormat::Text => "text",
        })
    }
}

/// Response model returned by [`Web::scrape_url`] (`POST /v1/web/scrape`).
///
/// The container-level `#[serde(default)]` means any field missing from the payload
/// takes its `Default` value (`None`, empty `Vec`, empty map) instead of failing.
/// The `crate::core::flex::*` deserializers are defined outside this file;
/// NOTE(review): presumably they tolerate loosely-typed input — confirm there.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(default)]
pub struct ScrapeUrlResponse {
    #[serde(default, deserialize_with = "crate::core::flex::opt_string")]
    pub ai_error: Option<String>,
    /// Value of the `ai_extraction` field as raw JSON; its shape is not fixed by this model.
    pub ai_extraction: Option<Value>,
    #[serde(default, deserialize_with = "crate::core::flex::opt_string")]
    pub ai_model: Option<String>,
    #[serde(default, deserialize_with = "crate::core::flex::opt_bool")]
    pub anti_bot_solved: Option<bool>,
    /// Contents of the `antibot_systems` array, kept as raw JSON values.
    pub antibot_systems: Vec<Value>,
    /// Value of the `blocking_details` object, keyed by field name with raw JSON values.
    pub blocking_details: Option<HashMap<String, Value>>,
    #[serde(default, deserialize_with = "crate::core::flex::opt_bool")]
    pub blocking_detected: Option<bool>,
    /// Contents of the `captcha_systems` array, kept as raw JSON values.
    pub captcha_systems: Vec<Value>,
    #[serde(default, deserialize_with = "crate::core::flex::opt_string")]
    pub content: Option<String>,
    #[serde(default, deserialize_with = "crate::core::flex::opt_i64")]
    pub content_length: Option<i64>,
    #[serde(default, deserialize_with = "crate::core::flex::opt_i64")]
    pub credits_used: Option<i64>,
    #[serde(default, deserialize_with = "crate::core::flex::opt_i64")]
    pub duration_ms: Option<i64>,
    #[serde(default, deserialize_with = "crate::core::flex::opt_string")]
    pub engine_used: Option<String>,
    #[serde(default, deserialize_with = "crate::core::flex::opt_string")]
    pub format: Option<String>,
    /// Value of the `headers` object; values stay raw JSON rather than being forced to strings.
    pub headers: HashMap<String, Value>,
    #[serde(default, deserialize_with = "crate::core::flex::opt_i64")]
    pub retries_used: Option<i64>,
    #[serde(default, deserialize_with = "crate::core::flex::opt_string")]
    pub screenshot_url: Option<String>,
    #[serde(default, deserialize_with = "crate::core::flex::opt_string")]
    pub solver_used: Option<String>,
    #[serde(default, deserialize_with = "crate::core::flex::opt_i64")]
    pub status_code: Option<i64>,
    #[serde(default, deserialize_with = "crate::core::flex::opt_bool")]
    pub success: Option<bool>,
    #[serde(default, deserialize_with = "crate::core::flex::opt_string")]
    pub url: Option<String>,
    #[serde(default, deserialize_with = "crate::core::flex::opt_string")]
    pub video_url: Option<String>,
    /// Fields present in the response but not in the spec.
    #[serde(flatten)]
    pub extra: HashMap<String, Value>,
}

/// Parameters for [`Web::detect_protection`].
///
/// Every field is an `Option`; a field left as `None` is omitted from the request body.
/// The struct derives `Default`, so callers can set only the fields they need and fill
/// the rest with `..Default::default()`.
#[derive(Debug, Clone, Default, Serialize)]
pub struct DetectProtectionParams {
    /// ISO 3166-1 alpha-2 country code for proxy geo-targeting.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub country: Option<String>,
    /// Request timeout in milliseconds.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub timeout: Option<i64>,
    /// The URL to analyze.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub url: Option<String>,
}

/// Parameters for [`Web::scrape_url`].
///
/// Every field is an `Option`; a field left as `None` is omitted from the request body.
/// The struct derives `Default`, so callers can set only the fields they need and fill
/// the rest with `..Default::default()`.
#[derive(Debug, Clone, Default, Serialize)]
pub struct ScrapeUrlParams {
    /// Run AI extraction on scraped content.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub ai_extract: Option<bool>,
    /// Natural language instruction for AI extraction.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub ai_prompt: Option<String>,
    /// Attempt anti-bot bypass when blocking detected.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub anti_bot: Option<bool>,
    /// ISO 3166-1 alpha-2 country code for proxy geo-targeting.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub country: Option<String>,
    /// Additional HTTP headers for the target request.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub custom_headers: Option<HashMap<String, Value>>,
    /// Scraping engine tier to use.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub engine: Option<ScrapeUrlEngine>,
    /// Allow auto-escalation to stronger engines.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub escalate: Option<bool>,
    /// Output format for the scraped content.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub format: Option<ScrapeUrlFormat>,
    /// Browser actions to perform before extracting.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub js_scenario: Option<Vec<HashMap<String, Value>>>,
    /// Maximum credits budget for this request.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_cost: Option<i64>,
    /// Force JavaScript rendering.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub render_js: Option<bool>,
    /// Max retry attempts on blocking detection.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub retry_count: Option<i64>,
    /// Auto-retry on blocking page detection.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub retry_on_block: Option<bool>,
    /// Capture a full-page PNG screenshot.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub screenshot: Option<bool>,
    /// Persist cookies and state across requests.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub session_id: Option<String>,
    /// The URL to scrape. Must be a valid HTTP or HTTPS URL.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub url: Option<String>,
    /// Record browser session as animated GIF (+3 credits).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub video: Option<bool>,
    /// Additional ms to wait after page load.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub wait_after_load: Option<i64>,
    /// CSS selector or XPath to wait for before extracting.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub wait_for: Option<String>,
    /// Max wait time in ms for wait_for selector.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub wait_timeout: Option<i64>,
}