Skip to main content

scrapfly_sdk/result/
crawler.rs

1//! Crawler result objects — port of `sdk/go/result_crawler.go`.
2
3use std::collections::BTreeMap;
4
5use bytes::Bytes;
6use serde::Deserialize;
7
/// String constants for the crawler job `status` field.
pub mod status {
    /// The job has not been picked up yet.
    pub const PENDING: &str = "PENDING";
    /// The job is currently running.
    pub const RUNNING: &str = "RUNNING";
    /// The job is done; consult `is_success` to tell success from failure.
    pub const DONE: &str = "DONE";
    /// The job was cancelled by the user.
    pub const CANCELLED: &str = "CANCELLED";
}
19
20/// Response from `POST /crawl`.
21#[derive(Debug, Clone, Deserialize, Default)]
22pub struct CrawlerStartResponse {
23    /// Crawler job UUID.
24    #[serde(default)]
25    pub crawler_uuid: String,
26    /// Initial status.
27    #[serde(default)]
28    pub status: String,
29}
30
31/// Inner `state` block of [`CrawlerStatus`].
32#[derive(Debug, Clone, Deserialize, Default)]
33pub struct CrawlerState {
34    /// URLs visited.
35    #[serde(default)]
36    pub urls_visited: u64,
37    /// URLs extracted.
38    #[serde(default)]
39    pub urls_extracted: u64,
40    /// URLs failed.
41    #[serde(default)]
42    pub urls_failed: u64,
43    /// URLs skipped.
44    #[serde(default)]
45    pub urls_skipped: u64,
46    /// URLs queued.
47    #[serde(default)]
48    pub urls_to_crawl: u64,
49    /// API credit used.
50    #[serde(default)]
51    pub api_credit_used: u64,
52    /// Duration (seconds).
53    #[serde(default)]
54    pub duration: u64,
55    /// Start time (Unix seconds, null while PENDING).
56    #[serde(default)]
57    pub start_time: Option<i64>,
58    /// Stop time (Unix seconds, null until terminal).
59    #[serde(default)]
60    pub stop_time: Option<i64>,
61    /// Documented stop reason (null while running).
62    #[serde(default)]
63    pub stop_reason: Option<String>,
64}
65
66/// Response from `GET /crawl/{uuid}/status`.
67#[derive(Debug, Clone, Deserialize, Default)]
68pub struct CrawlerStatus {
69    /// Crawler UUID.
70    #[serde(default)]
71    pub crawler_uuid: String,
72    /// Status enum (`PENDING`, `RUNNING`, `DONE`, `CANCELLED`).
73    #[serde(default)]
74    pub status: String,
75    /// Whether the crawler reached a terminal state.
76    #[serde(default)]
77    pub is_finished: bool,
78    /// Success marker (nullable while running).
79    #[serde(default)]
80    pub is_success: Option<bool>,
81    /// Per-job metrics.
82    #[serde(default)]
83    pub state: CrawlerState,
84}
85
86impl CrawlerStatus {
87    /// True while still pending or running.
88    pub fn is_running(&self) -> bool {
89        self.status == status::PENDING || self.status == status::RUNNING
90    }
91    /// True when terminated successfully.
92    pub fn is_complete(&self) -> bool {
93        self.status == status::DONE && self.is_success == Some(true)
94    }
95    /// True when terminated with failure.
96    pub fn is_failed(&self) -> bool {
97        self.status == status::DONE && self.is_success == Some(false)
98    }
99    /// True when cancelled by the user.
100    pub fn is_cancelled(&self) -> bool {
101        self.status == status::CANCELLED
102    }
103}
104
/// A single entry of the streaming `urls` list.
///
/// Derives `Default`, `PartialEq` and `Eq` so entries can be default-built
/// and compared directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CrawlerUrlEntry {
    /// The URL.
    pub url: String,
    /// Status (visited/pending/failed/skipped), echoed from the request.
    pub status: String,
    /// Reason for the failure or skip; only set for `failed`/`skipped`
    /// entries and empty otherwise.
    pub reason: String,
}
115
116/// Streaming response from `GET /crawl/{uuid}/urls`.
117#[derive(Debug, Clone, Default)]
118pub struct CrawlerUrls {
119    /// URL entries on this page.
120    pub urls: Vec<CrawlerUrlEntry>,
121    /// Page number.
122    pub page: u32,
123    /// Page size.
124    pub per_page: u32,
125}
126
127impl CrawlerUrls {
128    /// Parse a `text/plain` body into a [`CrawlerUrls`]. Mirrors
129    /// `sdk/go/result_crawler.go::parseCrawlerURLs`.
130    pub fn from_text(body: &str, status_hint: &str, page: u32, per_page: u32) -> Self {
131        let mut urls = Vec::new();
132        for raw_line in body.split('\n') {
133            let line = raw_line.trim();
134            if line.is_empty() {
135                continue;
136            }
137            if status_hint == "visited" || status_hint == "pending" {
138                urls.push(CrawlerUrlEntry {
139                    url: line.to_string(),
140                    status: status_hint.to_string(),
141                    reason: String::new(),
142                });
143                continue;
144            }
145            if let Some(idx) = line.find(',') {
146                urls.push(CrawlerUrlEntry {
147                    url: line[..idx].to_string(),
148                    status: status_hint.to_string(),
149                    reason: line[idx + 1..].to_string(),
150                });
151            } else {
152                urls.push(CrawlerUrlEntry {
153                    url: line.to_string(),
154                    status: status_hint.to_string(),
155                    reason: String::new(),
156                });
157            }
158        }
159        Self {
160            urls,
161            page,
162            per_page,
163        }
164    }
165}
166
167/// `GET /crawl/{uuid}/contents` bulk-JSON envelope.
168#[derive(Debug, Clone, Deserialize, Default)]
169pub struct CrawlerContents {
170    /// `url → format → content`. The API can emit `null` for a format that
171    /// couldn't be produced for a given URL (e.g. `extracted_data` on a page
172    /// that no template matched); the SDK flattens `null → ""` so consumers
173    /// always get a string and can check emptiness. Mirrors Go's map[string]string
174    /// zero-value semantics.
175    #[serde(default, deserialize_with = "deserialize_contents_map")]
176    pub contents: BTreeMap<String, BTreeMap<String, String>>,
177    /// Pagination links.
178    #[serde(default)]
179    pub links: CrawlerContentsLinks,
180}
181
182/// Deserialize `{url: {format: string|null}}` tolerating `null` inner values
183/// by mapping them to the empty string.
184fn deserialize_contents_map<'de, D>(
185    deserializer: D,
186) -> Result<BTreeMap<String, BTreeMap<String, String>>, D::Error>
187where
188    D: serde::Deserializer<'de>,
189{
190    let raw: BTreeMap<String, BTreeMap<String, Option<String>>> =
191        BTreeMap::deserialize(deserializer)?;
192    Ok(raw
193        .into_iter()
194        .map(|(url, by_format)| {
195            (
196                url,
197                by_format
198                    .into_iter()
199                    .map(|(fmt, body)| (fmt, body.unwrap_or_default()))
200                    .collect(),
201            )
202        })
203        .collect())
204}
205
206/// Pagination links returned with bulk contents.
207///
208/// `next`/`prev` arrive as JSON `null` when there is no adjacent page, which
209/// would reject under a plain `String` field; [`null_as_empty_string`] maps
210/// both null and absent to the empty string so the public API stays typed.
211#[derive(Debug, Clone, Deserialize, Default)]
212pub struct CrawlerContentsLinks {
213    /// Crawled URLs link.
214    #[serde(default, deserialize_with = "null_as_empty_string")]
215    pub crawled_urls: String,
216    /// Next-page link (empty when on the last page).
217    #[serde(default, deserialize_with = "null_as_empty_string")]
218    pub next: String,
219    /// Previous-page link (empty when on the first page).
220    #[serde(default, deserialize_with = "null_as_empty_string")]
221    pub prev: String,
222}
223
224/// Coerce JSON `null | absent | string` into a plain `String`, where null
225/// and absent both collapse to the empty string. Mirrors Go's `string`
226/// zero-value behavior under `encoding/json`.
227fn null_as_empty_string<'de, D>(deserializer: D) -> Result<String, D::Error>
228where
229    D: serde::Deserializer<'de>,
230{
231    Ok(Option::<String>::deserialize(deserializer)?.unwrap_or_default())
232}
233
/// Typed wrapper around the content of one crawled URL (`Crawl::read`).
#[derive(Clone, Debug, Default)]
pub struct CrawlContent {
    /// The crawled URL.
    pub url: String,
    /// The content, in the format that was requested.
    pub content: String,
    /// UUID of the parent crawler.
    pub crawl_uuid: String,
}
244
/// Kind of crawler artifact: `warc` or `har`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CrawlerArtifactType {
    /// A WARC artifact.
    Warc,
    /// A HAR artifact.
    Har,
}
253
254impl CrawlerArtifactType {
255    /// Wire-format string.
256    pub fn as_str(&self) -> &'static str {
257        match self {
258            Self::Warc => "warc",
259            Self::Har => "har",
260        }
261    }
262}
263
264/// WARC or HAR artifact downloaded from the crawler endpoint.
265#[derive(Debug, Clone)]
266pub struct CrawlerArtifact {
267    /// Artifact type.
268    pub artifact_type: CrawlerArtifactType,
269    /// Raw bytes.
270    pub data: Bytes,
271}
272
273impl CrawlerArtifact {
274    /// Write the artifact to disk.
275    pub fn save(&self, path: &std::path::Path) -> std::io::Result<()> {
276        std::fs::write(path, &self.data)
277    }
278    /// Byte length of the artifact.
279    pub fn len(&self) -> usize {
280        self.data.len()
281    }
282    /// True when empty.
283    pub fn is_empty(&self) -> bool {
284        self.data.is_empty()
285    }
286}