1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4use serde_json::Value;
5
6use crate::{document::Document, SpideryApp, SpideryError, API_VERSION};
7
/// Output formats that can be requested for a scrape.
///
/// Variants serialize to the exact wire names the Spidery API expects
/// (e.g. `"markdown"`, `"rawHtml"`, `"screenshot@fullPage"`).
#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
pub enum ScrapeFormats {
    /// Page content rendered as Markdown.
    #[serde(rename = "markdown")]
    Markdown,

    /// HTML content of the page (presumably post-processed by the API,
    /// as distinct from `RawHTML` — confirm against API docs).
    #[serde(rename = "html")]
    HTML,

    /// The raw HTML of the page as fetched.
    #[serde(rename = "rawHtml")]
    RawHTML,

    /// Links discovered on the page.
    #[serde(rename = "links")]
    Links,

    /// Screenshot of the page.
    #[serde(rename = "screenshot")]
    Screenshot,

    /// Full-page screenshot (entire page, not just the viewport).
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,

    /// Structured data extraction; configured via [`ExtractOptions`].
    #[serde(rename = "extract")]
    Extract,
}
44
/// Options for structured extraction, used together with
/// [`ScrapeFormats::Extract`].
///
/// All fields are optional; `None` fields are omitted from the serialized
/// request entirely (`skip_serializing_none`), letting the API apply its
/// own defaults. Keys serialize in camelCase (`system_prompt` → `systemPrompt`).
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct ExtractOptions {
    /// Arbitrary JSON describing the desired output shape
    /// (presumably a JSON Schema — confirm against API docs).
    pub schema: Option<Value>,

    /// System prompt for the extraction step.
    pub system_prompt: Option<String>,

    /// User prompt for the extraction step.
    pub prompt: Option<String>,
}
57
/// Per-request options for scraping a URL.
///
/// All fields are optional; `None` fields are omitted from the serialized
/// request body (`skip_serializing_none`) so the API falls back to its
/// defaults. Keys serialize in camelCase (`only_main_content` → `onlyMainContent`).
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct ScrapeOptions {
    /// Which output formats to return for the page.
    pub formats: Option<Vec<ScrapeFormats>>,

    /// Whether to return only the page's main content
    /// (NOTE(review): exact semantics defined server-side — confirm).
    pub only_main_content: Option<bool>,

    /// Tags to include in the output (presumably HTML tag/selector
    /// names interpreted by the API — confirm).
    pub include_tags: Option<Vec<String>>,

    /// Tags to exclude from the output.
    pub exclude_tags: Option<Vec<String>>,

    /// Extra HTTP headers to send when fetching the page.
    pub headers: Option<HashMap<String, String>>,

    /// How long to wait before capturing content
    /// (units defined by the API — presumably milliseconds; confirm).
    pub wait_for: Option<u32>,

    /// Request timeout (presumably milliseconds — confirm against API docs).
    pub timeout: Option<u32>,

    /// Structured-extraction configuration for the `extract` format.
    pub extract: Option<ExtractOptions>,
}
90
/// Internal request body for the scrape endpoint: the target URL plus the
/// flattened [`ScrapeOptions`], whose fields serialize as top-level keys
/// alongside `url` (via `#[serde(flatten)]`).
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct ScrapeRequestBody {
    /// URL of the page to scrape.
    url: String,

    /// Scrape options, merged into the same JSON object as `url`.
    #[serde(flatten)]
    options: ScrapeOptions,
}
99
/// Internal response envelope returned by the scrape endpoint.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct ScrapeResponse {
    /// Success flag reported by the API. Not inspected in this module —
    /// presumably error cases are handled by `handle_response`; confirm.
    success: bool,

    /// The scraped document payload.
    data: Document,
}
110
111impl SpideryApp {
112 pub async fn scrape_url(
114 &self,
115 url: impl AsRef<str>,
116 options: impl Into<Option<ScrapeOptions>>,
117 ) -> Result<Document, SpideryError> {
118 let body = ScrapeRequestBody {
119 url: url.as_ref().to_string(),
120 options: options.into().unwrap_or_default(),
121 };
122
123 let headers = self.prepare_headers(None);
124
125 let response = self
126 .client
127 .post(&format!("{}{}/scrape", self.api_url, API_VERSION))
128 .headers(headers)
129 .json(&body)
130 .send()
131 .await
132 .map_err(|e| SpideryError::HttpError(format!("Scraping {:?}", url.as_ref()), e))?;
133
134 let response = self
135 .handle_response::<ScrapeResponse>(response, "scrape URL")
136 .await?;
137
138 Ok(response.data)
139 }
140}