use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Parameters accepted for a scrape request.
///
/// Keys are read in camelCase. `url` is the only field without a serde
/// default; every other field is filled in when it is missing from the input.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ScrapeRequest {
    /// URL supplied by the caller (mandatory).
    pub url: String,

    /// Requested output formats; falls back to `default_formats()`.
    #[serde(default = "default_formats")]
    pub formats: Vec<String>,

    /// String-to-string header map; empty when omitted.
    #[serde(default)]
    pub headers: HashMap<String, String>,

    /// Tags to include; empty when omitted.
    #[serde(default)]
    pub include_tags: Vec<String>,

    /// Tags to exclude; empty when omitted.
    #[serde(default)]
    pub exclude_tags: Vec<String>,

    /// Falls back to `default_true()` when omitted.
    #[serde(default = "default_true")]
    pub only_main_content: bool,

    /// Falls back to `default_timeout()`.
    /// NOTE(review): unit is not visible here, presumably milliseconds — confirm.
    #[serde(default = "default_timeout")]
    pub timeout: u64,

    /// Falls back to `0`.
    /// NOTE(review): unit is not visible here, presumably milliseconds — confirm.
    #[serde(default)]
    pub wait_for: u64,

    /// Falls back to `default_true()` when omitted.
    #[serde(default = "default_true")]
    pub remove_base64_images: bool,

    /// Falls back to `false` when omitted.
    #[serde(default)]
    pub skip_tls_verification: bool,

    /// Engine name; falls back to `default_engine()`.
    #[serde(default = "default_engine")]
    pub engine: String,

    /// Optional selector; `None` when omitted.
    #[serde(default)]
    pub wait_for_selector: Option<String>,

    /// List of [`BrowserAction`]s; empty when omitted.
    #[serde(default)]
    pub actions: Vec<BrowserAction>,

    /// Falls back to `false` when omitted.
    #[serde(default)]
    pub screenshot: bool,

    /// Falls back to `default_screenshot_format()`.
    #[serde(default = "default_screenshot_format")]
    pub screenshot_format: String,
}
/// A single browser action carried in `ScrapeRequest::actions`.
///
/// Uses serde's internally tagged representation: the variant is selected by
/// a `type` key whose value is the camelCase variant name.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "camelCase")]
pub enum BrowserAction {
    /// Carries the `selector` to click.
    Click {
        selector: String,
    },
    /// Carries a `selector` and the `text` to type.
    Type {
        selector: String,
        text: String,
    },
    /// Carries a free-form `direction` string.
    Scroll {
        direction: String,
    },
    /// Carries a duration in `milliseconds`.
    Wait {
        milliseconds: u64,
    },
    /// Carries the `selector` to wait for.
    WaitForSelector {
        selector: String,
    },
}
/// Serde default for the `formats` fields: a single `"markdown"` entry.
fn default_formats() -> Vec<String> {
    vec![String::from("markdown")]
}
/// Serde default for boolean fields that should be on unless the input says
/// otherwise (plain `#[serde(default)]` would give `false`).
fn default_true() -> bool {
    true
}
/// Serde default for `ScrapeRequest::timeout`.
/// NOTE(review): presumably milliseconds (i.e. 30 s) — confirm against the consumer.
fn default_timeout() -> u64 {
    30_000
}
/// Serde default for `ScrapeRequest::engine`: the string `"auto"`.
fn default_engine() -> String {
    String::from("auto")
}
/// Serde default for `ScrapeRequest::screenshot_format`: the string `"png"`.
fn default_screenshot_format() -> String {
    String::from("png")
}
impl Default for ScrapeRequest {
    /// Builds a request with an empty `url` and the same fallback values the
    /// serde attributes on the struct apply to missing fields.
    fn default() -> Self {
        Self {
            // Fields whose fallback comes from a dedicated helper.
            formats: default_formats(),
            only_main_content: default_true(),
            timeout: default_timeout(),
            remove_base64_images: default_true(),
            engine: default_engine(),
            screenshot_format: default_screenshot_format(),

            // Fields that start out empty, zero, or off.
            url: String::default(),
            headers: HashMap::default(),
            include_tags: Vec::default(),
            exclude_tags: Vec::default(),
            actions: Vec::default(),
            wait_for: 0,
            wait_for_selector: None,
            skip_tls_verification: false,
            screenshot: false,
        }
    }
}
/// Response envelope for a scrape request.
///
/// No `rename_all` is applied, so keys are emitted exactly as the field names
/// are written (e.g. `scrape_id`). `None` fields are left out of the output.
#[derive(Clone, Debug, Serialize)]
pub struct ScrapeResponse {
    /// Always serialized.
    pub success: bool,

    /// Optional warning text.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub warning: Option<String>,

    /// Optional [`Document`] payload.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub data: Option<Document>,

    /// Optional error text.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,

    /// Optional identifier; the constructors in this file leave it `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scrape_id: Option<String>,
}
/// Content payload for one page, with camelCase keys.
///
/// Every optional field is skipped on output when it is `None`. `metadata`
/// has no `skip`/`default` attribute, so it is always written and must be
/// present when deserializing.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Document {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub title: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub url: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub markdown: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub html: Option<String>,

    /// Emitted under the key `rawHtml`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub raw_html: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub links: Option<Vec<String>>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub images: Option<Vec<String>>,

    /// NOTE(review): encoding of the screenshot string is not visible here — confirm.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub screenshot: Option<String>,

    /// Always present; see [`Metadata`].
    pub metadata: Metadata,
}
/// Metadata attached to a [`Document`], with camelCase keys.
///
/// All fields except `status_code` are optional and skipped on output when
/// `None`. `status_code` is always written and is required on input.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Metadata {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub title: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,

    /// Held as one string, not a list.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub keywords: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub robots: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub og_title: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub og_description: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub og_url: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub og_image: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub url: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub source_url: Option<String>,

    /// Always serialized (key `statusCode`); the `Default` impl sets it to 200.
    pub status_code: u16,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub content_type: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub canonical_url: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub word_count: Option<usize>,

    /// NOTE(review): unit is not visible here, presumably minutes — confirm.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reading_time: Option<usize>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub excerpt: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub detected_frameworks: Option<Vec<String>>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub detection_reason: Option<String>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub content_script_ratio: Option<f64>,
}
impl Default for Metadata {
    /// Returns metadata with every optional field unset and `status_code`
    /// set to 200.
    ///
    /// Written by hand because a derived `Default` would give `status_code`
    /// the value 0.
    fn default() -> Self {
        Self {
            // The only field that is not `None`.
            status_code: 200,

            // Page-level descriptors.
            title: None,
            description: None,
            language: None,
            keywords: None,
            robots: None,
            excerpt: None,

            // `og_*` values.
            og_title: None,
            og_description: None,
            og_url: None,
            og_image: None,

            // URLs and content type.
            url: None,
            source_url: None,
            canonical_url: None,
            content_type: None,

            // Counts and detection fields.
            word_count: None,
            reading_time: None,
            detected_frameworks: None,
            detection_reason: None,
            content_script_ratio: None,
        }
    }
}
/// Serde default for optional boolean fields that should be `Some(true)`
/// when the key is absent (used by `CrawlRequest::detect_pagination`).
fn default_true_option() -> Option<bool> {
    let enabled = true;
    Some(enabled)
}
impl ScrapeResponse {
    /// Wraps `data` in a successful response; `warning`, `error` and
    /// `scrape_id` are left unset.
    pub fn success(data: Document) -> Self {
        let payload = Some(data);
        Self {
            success: true,
            data: payload,
            warning: None,
            error: None,
            scrape_id: None,
        }
    }

    /// Builds a failed response carrying `error`; `warning`, `data` and
    /// `scrape_id` are left unset.
    pub fn error(error: String) -> Self {
        let reason = Some(error);
        Self {
            success: false,
            error: reason,
            warning: None,
            data: None,
            scrape_id: None,
        }
    }
}
/// Parameters accepted for a map request, read with camelCase keys.
///
/// Only `url` is mandatory; the remaining fields fall back to the defaults
/// noted on each one when the key is absent.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct MapRequest {
    /// URL supplied by the caller (mandatory).
    pub url: String,

    /// Optional search string; `None` when omitted.
    #[serde(default)]
    pub search: Option<String>,

    /// `None` when omitted.
    #[serde(default)]
    pub ignore_sitemap: Option<bool>,

    /// Falls back to `default_include_subdomains()` when omitted.
    #[serde(default = "default_include_subdomains")]
    pub include_subdomains: Option<bool>,

    /// Falls back to `default_map_limit()` when omitted.
    #[serde(default = "default_map_limit")]
    pub limit: Option<u32>,
}
/// Response envelope for a map request.
///
/// Keys are emitted exactly as the field names are written (no `rename_all`);
/// `None` fields are left out of the output.
#[derive(Clone, Debug, Serialize)]
pub struct MapResponse {
    /// Always serialized.
    pub success: bool,

    /// Optional list of link strings.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub links: Option<Vec<String>>,

    /// Optional error text.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,

    /// Optional identifier; the constructors in this file leave it `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scrape_id: Option<String>,
}
/// Serde default for `MapRequest::include_subdomains`: `Some(true)`.
fn default_include_subdomains() -> Option<bool> {
    let include = true;
    Some(include)
}
/// Serde default for `MapRequest::limit`: `Some(5000)`.
fn default_map_limit() -> Option<u32> {
    Some(5_000)
}
impl MapResponse {
    /// Wraps `links` in a successful response; `error` and `scrape_id` are
    /// left unset.
    pub fn success(links: Vec<String>) -> Self {
        let found = Some(links);
        Self {
            success: true,
            links: found,
            error: None,
            scrape_id: None,
        }
    }

    /// Builds a failed response carrying `error`; `links` and `scrape_id`
    /// are left unset.
    pub fn error(error: String) -> Self {
        let reason = Some(error);
        Self {
            success: false,
            error: reason,
            links: None,
            scrape_id: None,
        }
    }
}
/// Parameters for a crawl, read and written with camelCase keys.
///
/// Only `url` is mandatory on input; each other field falls back to the
/// default noted on it when the key is absent.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct CrawlRequest {
    /// URL supplied by the caller (mandatory).
    pub url: String,

    /// `None` when omitted.
    #[serde(default)]
    pub exclude_paths: Option<Vec<String>>,

    /// `None` when omitted.
    #[serde(default)]
    pub include_paths: Option<Vec<String>>,

    /// Falls back to `default_max_depth()`.
    #[serde(default = "default_max_depth")]
    pub max_depth: u32,

    /// Falls back to `default_limit()`.
    #[serde(default = "default_limit")]
    pub limit: u32,

    /// `None` when omitted.
    #[serde(default)]
    pub allow_backward_links: Option<bool>,

    /// `None` when omitted.
    #[serde(default)]
    pub allow_external_links: Option<bool>,

    /// `None` when omitted.
    #[serde(default)]
    pub ignore_sitemap: Option<bool>,

    /// Falls back to `default_true_option()` when omitted.
    #[serde(default = "default_true_option")]
    pub detect_pagination: Option<bool>,

    /// Falls back to `default_max_pagination_pages()` when omitted.
    #[serde(default = "default_max_pagination_pages")]
    pub max_pagination_pages: Option<u32>,

    /// `None` when omitted.
    #[serde(default)]
    pub use_parallel: Option<bool>,
}
/// Response envelope for a crawl.
///
/// Keys are emitted exactly as the field names are written (no `rename_all`);
/// `None` fields are left out of the output.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct CrawlResponse {
    /// Always serialized.
    pub success: bool,

    /// Optional list of [`Document`]s.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub data: Option<Vec<Document>>,

    /// Optional error text.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,

    /// Optional crawl identifier; set by `CrawlResponse::started`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub crawl_id: Option<String>,

    /// Optional message; set by `CrawlResponse::started`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub message: Option<String>,
}
/// Serde default for `CrawlRequest::max_depth`.
fn default_max_depth() -> u32 {
    const MAX_DEPTH: u32 = 2;
    MAX_DEPTH
}
/// Serde default for `CrawlRequest::limit`.
fn default_limit() -> u32 {
    const LIMIT: u32 = 100;
    LIMIT
}
/// Serde default for `CrawlRequest::max_pagination_pages`: `Some(50)`.
fn default_max_pagination_pages() -> Option<u32> {
    let pages: u32 = 50;
    Some(pages)
}
impl CrawlResponse {
    /// Wraps `data` in a successful response; `error`, `crawl_id` and
    /// `message` are left unset.
    pub fn success(data: Vec<Document>) -> Self {
        Self {
            success: true,
            data: Some(data),
            error: None,
            crawl_id: None,
            message: None,
        }
    }

    /// Builds a failed response carrying `error`; `data`, `crawl_id` and
    /// `message` are left unset.
    pub fn error(error: String) -> Self {
        Self {
            success: false,
            data: None,
            error: Some(error),
            crawl_id: None,
            message: None,
        }
    }

    /// Builds a successful response carrying `crawl_id` and a message of the
    /// form `"Crawl started with ID: <crawl_id>"`; `data` and `error` are
    /// left unset.
    pub fn started(crawl_id: String) -> Self {
        // Format from a borrow first so the id can then be moved into the
        // response without cloning it.
        let message = format!("Crawl started with ID: {}", crawl_id);
        Self {
            success: true,
            data: None,
            error: None,
            crawl_id: Some(crawl_id),
            message: Some(message),
        }
    }
}
/// Parameters accepted for a search request, read with camelCase keys.
///
/// Only `query` is mandatory; the other fields fall back to the defaults
/// noted on each one.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchRequest {
    /// Query string supplied by the caller (mandatory).
    pub query: String,

    /// Falls back to `default_search_limit()`.
    #[serde(default = "default_search_limit")]
    pub limit: u32,

    /// Falls back to `false` when omitted.
    #[serde(default)]
    pub scrape_results: bool,

    /// Optional [`ScrapeOptions`]; `None` when omitted.
    #[serde(default)]
    pub scrape_options: Option<ScrapeOptions>,
}
/// Scrape settings nested in `SearchRequest::scrape_options`, read with
/// camelCase keys. Every field has a fallback.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ScrapeOptions {
    /// Falls back to `default_formats()`.
    #[serde(default = "default_formats")]
    pub formats: Vec<String>,

    /// Falls back to `default_true()` when omitted.
    #[serde(default = "default_true")]
    pub only_main_content: bool,

    /// Falls back to `default_scrape_timeout()`, a different helper from the
    /// one used by `ScrapeRequest::timeout`.
    /// NOTE(review): unit is not visible here, presumably milliseconds — confirm.
    #[serde(default = "default_scrape_timeout")]
    pub timeout: u64,
}
/// Response envelope for a search request; `None` fields are left out of
/// the output.
#[derive(Clone, Debug, Serialize)]
pub struct SearchResponse {
    /// Always serialized.
    pub success: bool,

    /// Optional list of [`SearchResult`]s.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub data: Option<Vec<SearchResult>>,

    /// Optional error text.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
}
/// One entry in `SearchResponse::data`.
///
/// `title`, `url` and `snippet` are always serialized; `content` is left out
/// when it is `None`.
#[derive(Clone, Debug, Serialize)]
pub struct SearchResult {
    pub title: String,
    pub url: String,
    pub snippet: String,

    /// Optional [`Document`] for this result.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content: Option<Document>,
}
/// Serde default for `SearchRequest::limit`.
fn default_search_limit() -> u32 {
    const SEARCH_LIMIT: u32 = 10;
    SEARCH_LIMIT
}
/// Serde default for `ScrapeOptions::timeout`.
/// NOTE(review): presumably milliseconds (i.e. 10 s) — confirm against the consumer.
fn default_scrape_timeout() -> u64 {
    10_000
}
impl SearchResponse {
    /// Wraps `data` in a successful response with no error.
    pub fn success(data: Vec<SearchResult>) -> Self {
        let results = Some(data);
        Self {
            success: true,
            data: results,
            error: None,
        }
    }

    /// Builds a failed response carrying `error` and no data.
    pub fn error(error: String) -> Self {
        let reason = Some(error);
        Self {
            success: false,
            error: reason,
            data: None,
        }
    }
}
/// Events describing a crawl, serialized in serde's internally tagged form.
///
/// Each value carries a `type` key holding the lowercased variant name
/// (`status`, `document`, `error`, `complete`). The enum-level `rename_all`
/// applies to variant names only, so the field keys stay as written
/// (snake_case) and `Option` fields are emitted even when `None`.
#[derive(Clone, Debug, Serialize)]
#[serde(tag = "type", rename_all = "lowercase")]
pub enum CrawlEvent {
    /// Carries two counters and an optional URL.
    Status {
        pages_crawled: usize,
        queue_size: usize,
        current_url: Option<String>,
    },

    /// Carries page content; `metadata` is boxed.
    /// NOTE(review): the box presumably keeps this variant small — confirm.
    Document {
        url: String,
        title: Option<String>,
        markdown: Option<String>,
        metadata: Box<Metadata>,
    },

    /// Carries a URL and the error text associated with it.
    Error {
        url: String,
        error: String,
    },

    /// Carries three final counters.
    Complete {
        total_pages: usize,
        success: usize,
        errors: usize,
    },
}