spidery/
scrape.rs

1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4use serde_json::Value;
5
6use crate::{document::Document, SpideryApp, SpideryError, API_VERSION};
7
8#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
9pub enum ScrapeFormats {
10    /// Will result in a copy of the Markdown content of the page.
11    #[serde(rename = "markdown")]
12    Markdown,
13
14    /// Will result in a copy of the filtered, content-only HTML of the page.
15    #[serde(rename = "html")]
16    HTML,
17
18    /// Will result in a copy of the raw HTML of the page.
19    #[serde(rename = "rawHtml")]
20    RawHTML,
21
22    /// Will result in a Vec of URLs found on the page.
23    #[serde(rename = "links")]
24    Links,
25
26    /// Will result in a URL to a screenshot of the page.
27    ///
28    /// Can not be used in conjunction with `ScrapeFormats::ScreenshotFullPage`.
29    #[serde(rename = "screenshot")]
30    Screenshot,
31
32    /// Will result in a URL to a full-page screenshot of the page.
33    ///
34    /// Can not be used in conjunction with `ScrapeFormats::Screenshot`.
35    #[serde(rename = "screenshot@fullPage")]
36    ScreenshotFullPage,
37
38    /// Will result in the results of an LLM extraction.
39    ///
40    /// See `ScrapeOptions.extract` for more options.
41    #[serde(rename = "extract")]
42    Extract,
43}
44
/// Options controlling LLM-based extraction, used together with
/// `ScrapeFormats::Extract` via `ScrapeOptions.extract`.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct ExtractOptions {
    /// Schema the output should adhere to, provided in JSON Schema format.
    pub schema: Option<Value>,

    /// System prompt to send to the LLM agent.
    // NOTE(review): presumably overrides the API's default system prompt —
    // confirm against the Spidery API docs.
    pub system_prompt: Option<String>,

    /// Extraction prompt to send to the LLM agent along with the page content.
    pub prompt: Option<String>,
}
57
/// Options for a single scrape request.
///
/// All fields are optional; `None` fields are omitted from the serialized
/// request body (via `skip_serializing_none`), letting the API apply its
/// own defaults.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct ScrapeOptions {
    /// Formats to extract from the page. (default: `[ Markdown ]`)
    pub formats: Option<Vec<ScrapeFormats>>,

    /// Only extract the main content of the page, excluding navigation and other miscellaneous content. (default: `true`)
    pub only_main_content: Option<bool>,

    /// HTML tags to exclusively include.
    ///
    /// For example, if you pass `div`, you will only get content from `<div>`s and their children.
    pub include_tags: Option<Vec<String>>,

    /// HTML tags to exclude.
    ///
    /// For example, if you pass `img`, you will never get image URLs in your results.
    pub exclude_tags: Option<Vec<String>>,

    /// Additional HTTP headers to use when loading the page.
    pub headers: Option<HashMap<String, String>>,

    /// Amount of time to wait after loading the page, and before grabbing the content, in milliseconds. (default: `0`)
    pub wait_for: Option<u32>,

    /// Timeout before returning an error, in milliseconds. (default: `60000`)
    pub timeout: Option<u32>,

    /// Extraction options, to be used in conjunction with `ScrapeFormats::Extract`.
    pub extract: Option<ExtractOptions>,
}
90
/// JSON body sent to the `/scrape` endpoint.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct ScrapeRequestBody {
    /// The URL to scrape.
    url: String,

    // `flatten` merges the option fields into the top level of the JSON
    // object, alongside `url`, as the API expects.
    #[serde(flatten)]
    options: ScrapeOptions,
}
99
/// JSON response returned by the `/scrape` endpoint.
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct ScrapeResponse {
    /// This will always be `true` due to `SpideryApp::handle_response`.
    /// No need to expose.
    // Kept so deserialization matches the full response shape, even though
    // the value is never read here.
    success: bool,

    /// The resulting document.
    data: Document,
}
110
111impl SpideryApp {
112    /// Scrapes a URL using the Spidery API.
113    pub async fn scrape_url(
114        &self,
115        url: impl AsRef<str>,
116        options: impl Into<Option<ScrapeOptions>>,
117    ) -> Result<Document, SpideryError> {
118        let body = ScrapeRequestBody {
119            url: url.as_ref().to_string(),
120            options: options.into().unwrap_or_default(),
121        };
122
123        let headers = self.prepare_headers(None);
124
125        let response = self
126            .client
127            .post(&format!("{}{}/scrape", self.api_url, API_VERSION))
128            .headers(headers)
129            .json(&body)
130            .send()
131            .await
132            .map_err(|e| SpideryError::HttpError(format!("Scraping {:?}", url.as_ref()), e))?;
133
134        let response = self
135            .handle_response::<ScrapeResponse>(response, "scrape URL")
136            .await?;
137
138        Ok(response.data)
139    }
140}