scrapfly_sdk/crawler.rs

//! High-level [`Crawl`] wrapper — port of `sdk/go/crawl.go`.
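//!
//! A minimal end-to-end sketch of the intended lifecycle. It is marked
//! `ignore` because the import paths, the `Client`/`CrawlerConfig` setup, and
//! the example URL are assumptions rather than part of this file:
//!
//! ```ignore
//! use scrapfly_sdk::client::Client;
//! use scrapfly_sdk::config::crawler::CrawlerConfig;
//! use scrapfly_sdk::crawler::{Crawl, WaitOptions};
//! use scrapfly_sdk::enums::CrawlerContentFormat;
//! use scrapfly_sdk::error::ScrapflyError;
//!
//! async fn run(
//!     client: &Client,
//!     config: CrawlerConfig,
//!     format: CrawlerContentFormat,
//! ) -> Result<(), ScrapflyError> {
//!     let mut crawl = Crawl::new(client, config);
//!     crawl.start().await?;                      // schedule the job
//!     crawl.wait(WaitOptions::default()).await?; // poll until a terminal state
//!     // Fetch one page's content; empty string if the URL wasn't crawled.
//!     let body = crawl.read_string("https://example.com/", format).await?;
//!     println!("{} bytes for the start URL", body.len());
//!     Ok(())
//! }
//! ```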

use std::time::{Duration, Instant};

use crate::client::Client;
use crate::config::crawler::CrawlerConfig;
use crate::enums::CrawlerContentFormat;
use crate::error::ScrapflyError;
use crate::result::crawler::{
    CrawlContent, CrawlerArtifact, CrawlerArtifactType, CrawlerContents, CrawlerStatus, CrawlerUrls,
};

/// Polling options for [`Crawl::wait`].
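///
/// For example, a bounded wait that polls every 10 seconds can be built with
/// struct-update syntax (sketch; the import path is an assumption):
///
/// ```ignore
/// use std::time::Duration;
/// use scrapfly_sdk::crawler::WaitOptions;
///
/// let opts = WaitOptions {
///     poll_interval: Duration::from_secs(10),
///     max_wait: Some(Duration::from_secs(600)), // give up after 10 minutes
///     ..WaitOptions::default()
/// };
/// ```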
#[derive(Debug, Clone)]
pub struct WaitOptions {
    /// How often to poll (default 5s).
    pub poll_interval: Duration,
    /// Optional deadline; `None` means wait forever.
    pub max_wait: Option<Duration>,
    /// Verbose logging (currently a no-op — reserved for future use).
    pub verbose: bool,
    /// Return `Ok(())` instead of `CrawlerCancelled` when the job
    /// terminates in the CANCELLED state. Useful for the
    /// cancel-then-wait pattern.
    pub allow_cancelled: bool,
}

impl Default for WaitOptions {
    fn default() -> Self {
        Self {
            poll_interval: Duration::from_secs(5),
            max_wait: None,
            verbose: false,
            allow_cancelled: false,
        }
    }
}

/// High-level crawler lifecycle wrapper. Holds a borrow of the [`Client`]
/// and caches the last status + downloaded artifacts.
pub struct Crawl<'a> {
    client: &'a Client,
    config: CrawlerConfig,
    uuid: Option<String>,
    cached_status: Option<CrawlerStatus>,
    cached_warc: Option<CrawlerArtifact>,
    cached_har: Option<CrawlerArtifact>,
}

impl<'a> Crawl<'a> {
    /// Wrap a [`CrawlerConfig`] without starting the job.
    pub fn new(client: &'a Client, config: CrawlerConfig) -> Self {
        Self {
            client,
            config,
            uuid: None,
            cached_status: None,
            cached_warc: None,
            cached_har: None,
        }
    }

    /// Job UUID (empty string before [`Crawl::start`]).
    pub fn uuid(&self) -> &str {
        self.uuid.as_deref().unwrap_or("")
    }

    /// Whether `start()` has been called successfully.
    pub fn started(&self) -> bool {
        self.uuid.is_some()
    }

    /// Schedule the crawler job. Returns `CrawlerAlreadyStarted` on re-entry.
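    ///
    /// Sketch of the start/re-entry behaviour (paths and the helper function
    /// are assumptions; an already constructed [`Crawl`] is taken as input):
    ///
    /// ```ignore
    /// use scrapfly_sdk::crawler::Crawl;
    /// use scrapfly_sdk::error::ScrapflyError;
    ///
    /// async fn demo(crawl: &mut Crawl<'_>) -> Result<(), ScrapflyError> {
    ///     crawl.start().await?;
    ///     println!("crawler uuid: {}", crawl.uuid()); // non-empty once started
    ///     assert!(crawl.started());
    ///     assert!(crawl.start().await.is_err());      // second call: CrawlerAlreadyStarted
    ///     Ok(())
    /// }
    /// ```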
    pub async fn start(&mut self) -> Result<(), ScrapflyError> {
        if self.uuid.is_some() {
            return Err(ScrapflyError::CrawlerAlreadyStarted);
        }
        let resp = self.client.start_crawl(&self.config).await?;
        self.uuid = Some(resp.crawler_uuid);
        Ok(())
    }

    fn uuid_required(&self) -> Result<&str, ScrapflyError> {
        match &self.uuid {
            Some(u) => Ok(u.as_str()),
            None => Err(ScrapflyError::CrawlerNotStarted),
        }
    }

    /// Fetch the status, optionally using the cached copy.
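    ///
    /// Sketch of the caching behaviour (paths and the helper function are
    /// assumptions):
    ///
    /// ```ignore
    /// use scrapfly_sdk::crawler::Crawl;
    /// use scrapfly_sdk::error::ScrapflyError;
    ///
    /// async fn check(crawl: &mut Crawl<'_>) -> Result<(), ScrapflyError> {
    ///     let fresh = crawl.status(true).await?;    // always refetches
    ///     println!("finished: {}", fresh.is_finished);
    ///     let _cached = crawl.status(false).await?; // served from the cache
    ///     Ok(())
    /// }
    /// ```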
    pub async fn status(&mut self, refresh: bool) -> Result<&CrawlerStatus, ScrapflyError> {
        let uuid = self.uuid_required()?.to_string();
        if refresh || self.cached_status.is_none() {
            let s = self.client.crawl_status(&uuid).await?;
            self.cached_status = Some(s);
        }
        match &self.cached_status {
            Some(s) => Ok(s),
            None => Err(ScrapflyError::CrawlerNotStarted),
        }
    }

    /// Poll status until the job reaches a terminal state.
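    ///
    /// Sketch of a bounded wait that tolerates a timeout (paths and the helper
    /// function are assumptions):
    ///
    /// ```ignore
    /// use std::time::Duration;
    /// use scrapfly_sdk::crawler::{Crawl, WaitOptions};
    /// use scrapfly_sdk::error::ScrapflyError;
    ///
    /// async fn wait_briefly(crawl: &mut Crawl<'_>) -> Result<(), ScrapflyError> {
    ///     let opts = WaitOptions {
    ///         poll_interval: Duration::from_secs(2),
    ///         max_wait: Some(Duration::from_secs(120)),
    ///         ..WaitOptions::default()
    ///     };
    ///     match crawl.wait(opts).await {
    ///         Ok(()) => println!("crawl finished"),
    ///         Err(ScrapflyError::CrawlerTimeout) => println!("still running after 2 minutes"),
    ///         Err(e) => return Err(e),
    ///     }
    ///     Ok(())
    /// }
    /// ```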
    pub async fn wait(&mut self, opts: WaitOptions) -> Result<(), ScrapflyError> {
        self.uuid_required()?;
        let deadline = opts.max_wait.map(|d| Instant::now() + d);
        loop {
            let status = self.status(true).await?.clone();
            if status.is_finished || status.is_cancelled() {
                if status.is_failed() {
                    let reason = status.state.stop_reason.clone().unwrap_or_default();
                    return Err(ScrapflyError::CrawlerFailed(crate::error::ApiError {
                        message: format!("crawl failed (stop_reason={})", reason),
                        ..Default::default()
                    }));
                }
                if status.is_cancelled() {
                    if opts.allow_cancelled {
                        return Ok(());
                    }
                    return Err(ScrapflyError::CrawlerCancelled);
                }
                return Ok(());
            }
            if let Some(d) = deadline {
                if Instant::now() + opts.poll_interval > d {
                    return Err(ScrapflyError::CrawlerTimeout);
                }
            }
            tokio::time::sleep(opts.poll_interval).await;
        }
    }

    /// Cancel the running crawl. No-op if already finished server-side.
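    ///
    /// The cancel-then-wait pattern mentioned on `WaitOptions::allow_cancelled`
    /// looks roughly like this (sketch; paths and the helper function are
    /// assumptions):
    ///
    /// ```ignore
    /// use scrapfly_sdk::crawler::{Crawl, WaitOptions};
    /// use scrapfly_sdk::error::ScrapflyError;
    ///
    /// async fn stop(crawl: &mut Crawl<'_>) -> Result<(), ScrapflyError> {
    ///     crawl.cancel().await?;
    ///     // Wait for the server to reach the CANCELLED state without
    ///     // treating it as an error.
    ///     crawl
    ///         .wait(WaitOptions { allow_cancelled: true, ..WaitOptions::default() })
    ///         .await
    /// }
    /// ```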
    pub async fn cancel(&self) -> Result<(), ScrapflyError> {
        let uuid = self.uuid_required()?;
        self.client.crawl_cancel(uuid).await
    }

    /// Paginated URL listing.
    pub async fn urls(
        &self,
        status_filter: Option<&str>,
        page: u32,
        per_page: u32,
    ) -> Result<CrawlerUrls, ScrapflyError> {
        let uuid = self.uuid_required()?;
        self.client
            .crawl_urls(uuid, status_filter, page, per_page)
            .await
    }

    /// Read a single URL's content and wrap it in a [`CrawlContent`]. Returns
    /// `Ok(None)` when the URL isn't part of the crawl.
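    ///
    /// Sketch of handling the `Option` (paths, the helper function, and the
    /// example URL are assumptions; the format is passed in by the caller):
    ///
    /// ```ignore
    /// use scrapfly_sdk::crawler::Crawl;
    /// use scrapfly_sdk::enums::CrawlerContentFormat;
    /// use scrapfly_sdk::error::ScrapflyError;
    ///
    /// async fn show(crawl: &Crawl<'_>, format: CrawlerContentFormat) -> Result<(), ScrapflyError> {
    ///     match crawl.read("https://example.com/pricing", format).await? {
    ///         Some(page) => println!("{} -> {} bytes", page.url, page.content.len()),
    ///         None => println!("URL was not part of this crawl"),
    ///     }
    ///     Ok(())
    /// }
    /// ```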
    pub async fn read(
        &self,
        target_url: &str,
        format: CrawlerContentFormat,
    ) -> Result<Option<CrawlContent>, ScrapflyError> {
        let uuid = self.uuid_required()?.to_string();
        match self
            .client
            .crawl_contents_plain(&uuid, target_url, format)
            .await
        {
            Ok(content) => Ok(Some(CrawlContent {
                url: target_url.to_string(),
                content,
                crawl_uuid: uuid,
            })),
            Err(ScrapflyError::ApiClient(e)) if e.http_status == 404 => Ok(None),
            Err(ScrapflyError::CrawlerFailed(e)) if e.http_status == 404 => Ok(None),
            Err(e) => Err(e),
        }
    }

    /// Read the raw content string (empty string when URL not in crawl).
    pub async fn read_string(
        &self,
        target_url: &str,
        format: CrawlerContentFormat,
    ) -> Result<String, ScrapflyError> {
        Ok(self
            .read(target_url, format)
            .await?
            .map(|c| c.content)
            .unwrap_or_default())
    }

    /// Batch read up to 100 URLs.
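    ///
    /// The nested map presumably keys by URL and then by format; a sketch of
    /// iterating it (paths, the helper function, and that key layout are
    /// assumptions, and the formats are passed in by the caller):
    ///
    /// ```ignore
    /// use scrapfly_sdk::crawler::Crawl;
    /// use scrapfly_sdk::enums::CrawlerContentFormat;
    /// use scrapfly_sdk::error::ScrapflyError;
    ///
    /// async fn dump(
    ///     crawl: &Crawl<'_>,
    ///     urls: Vec<String>,
    ///     formats: Vec<CrawlerContentFormat>,
    /// ) -> Result<(), ScrapflyError> {
    ///     let batch = crawl.read_batch(&urls, &formats).await?;
    ///     for (url, by_format) in &batch {
    ///         for (format, content) in by_format {
    ///             println!("{url} [{format}]: {} bytes", content.len());
    ///         }
    ///     }
    ///     Ok(())
    /// }
    /// ```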
    pub async fn read_batch(
        &self,
        urls: &[String],
        formats: &[CrawlerContentFormat],
    ) -> Result<
        std::collections::BTreeMap<String, std::collections::BTreeMap<String, String>>,
        ScrapflyError,
    > {
        let uuid = self.uuid_required()?;
        self.client.crawl_contents_batch(uuid, urls, formats).await
    }

    /// Bulk JSON contents.
    pub async fn contents(
        &self,
        format: CrawlerContentFormat,
        limit: Option<u32>,
        offset: Option<u32>,
    ) -> Result<CrawlerContents, ScrapflyError> {
        let uuid = self.uuid_required()?;
        self.client
            .crawl_contents_json(uuid, format, limit, offset)
            .await
    }

    /// Download + cache the WARC artifact.
    pub async fn warc(&mut self) -> Result<&CrawlerArtifact, ScrapflyError> {
        let uuid = self.uuid_required()?.to_string();
        if self.cached_warc.is_none() {
            let a = self
                .client
                .crawl_artifact(&uuid, CrawlerArtifactType::Warc)
                .await?;
            self.cached_warc = Some(a);
        }
        match &self.cached_warc {
            Some(a) => Ok(a),
            None => Err(ScrapflyError::Config(
                "warc cache unexpectedly empty".into(),
            )),
        }
    }

    /// Download + cache the HAR artifact.
    pub async fn har(&mut self) -> Result<&CrawlerArtifact, ScrapflyError> {
        let uuid = self.uuid_required()?.to_string();
        if self.cached_har.is_none() {
            let a = self
                .client
                .crawl_artifact(&uuid, CrawlerArtifactType::Har)
                .await?;
            self.cached_har = Some(a);
        }
        match &self.cached_har {
            Some(a) => Ok(a),
            None => Err(ScrapflyError::Config("har cache unexpectedly empty".into())),
        }
    }
}