use std::time::{Duration, Instant};

use crate::client::Client;
use crate::config::crawler::CrawlerConfig;
use crate::enums::CrawlerContentFormat;
use crate::error::ScrapflyError;
use crate::result::crawler::{
    CrawlContent, CrawlerArtifact, CrawlerArtifactType, CrawlerContents, CrawlerStatus, CrawlerUrls,
};

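/// Options controlling how [`Crawl::wait`] polls for completion.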
13#[derive(Debug, Clone)]
15pub struct WaitOptions {
16 pub poll_interval: Duration,
18 pub max_wait: Option<Duration>,
20 pub verbose: bool,
22 pub allow_cancelled: bool,
26}

impl Default for WaitOptions {
    fn default() -> Self {
        Self {
            poll_interval: Duration::from_secs(5),
            max_wait: None,
            verbose: false,
            allow_cancelled: false,
        }
    }
}

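/// A handle to a single crawl job: it starts the crawl, polls its status, and
/// caches the status and downloaded artifacts.
///
/// A minimal end-to-end sketch, marked `ignore` because it assumes an already
/// constructed [`Client`] and [`CrawlerConfig`]:
///
/// ```ignore
/// let mut crawl = Crawl::new(&client, config);
/// crawl.start().await?;
/// crawl.wait(WaitOptions::default()).await?;
/// let urls = crawl.urls(None, 1, 100).await?;
/// ```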
pub struct Crawl<'a> {
    client: &'a Client,
    config: CrawlerConfig,
    /// `None` until `start` succeeds.
    uuid: Option<String>,
    cached_status: Option<CrawlerStatus>,
    cached_warc: Option<CrawlerArtifact>,
    cached_har: Option<CrawlerArtifact>,
}

impl<'a> Crawl<'a> {
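    /// Create a crawl handle for `config` without starting it.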
    pub fn new(client: &'a Client, config: CrawlerConfig) -> Self {
        Self {
            client,
            config,
            uuid: None,
            cached_status: None,
            cached_warc: None,
            cached_har: None,
        }
    }

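    /// The crawl UUID, or an empty string if the crawl has not been started.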
    pub fn uuid(&self) -> &str {
        self.uuid.as_deref().unwrap_or("")
    }

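    /// Whether the crawl has been started.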
    pub fn started(&self) -> bool {
        self.uuid.is_some()
    }

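    /// Start the crawl and record its UUID; errors if already started.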
    pub async fn start(&mut self) -> Result<(), ScrapflyError> {
        if self.uuid.is_some() {
            return Err(ScrapflyError::CrawlerAlreadyStarted);
        }
        let resp = self.client.start_crawl(&self.config).await?;
        self.uuid = Some(resp.crawler_uuid);
        Ok(())
    }
    fn uuid_required(&self) -> Result<&str, ScrapflyError> {
        match &self.uuid {
            Some(u) => Ok(u.as_str()),
            None => Err(ScrapflyError::CrawlerNotStarted),
        }
    }

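    /// Fetch the crawl status, reusing the cached value unless `refresh` is true.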
    pub async fn status(&mut self, refresh: bool) -> Result<&CrawlerStatus, ScrapflyError> {
        // Owned copy of the UUID so the borrow of `self.uuid` ends before we
        // mutate the cache below.
        let uuid = self.uuid_required()?.to_string();
        if refresh || self.cached_status.is_none() {
            let s = self.client.crawl_status(&uuid).await?;
            self.cached_status = Some(s);
        }
        match &self.cached_status {
            Some(s) => Ok(s),
            None => Err(ScrapflyError::Config(
                "status cache unexpectedly empty".into(),
            )),
        }
    }

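    /// Poll the crawl status until it finishes, fails, is cancelled, or
    /// `opts.max_wait` elapses.
    ///
    /// A sketch of a bounded wait (marked `ignore`; assumes a started `crawl`
    /// handle):
    ///
    /// ```ignore
    /// crawl.wait(WaitOptions {
    ///     poll_interval: Duration::from_secs(2),
    ///     max_wait: Some(Duration::from_secs(300)),
    ///     ..WaitOptions::default()
    /// }).await?;
    /// ```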
    pub async fn wait(&mut self, opts: WaitOptions) -> Result<(), ScrapflyError> {
        self.uuid_required()?;
        let deadline = opts.max_wait.map(|d| Instant::now() + d);
        loop {
            // Clone to release the mutable borrow taken by `status()`.
            let status = self.status(true).await?.clone();
            if status.is_finished || status.is_cancelled() {
                if status.is_failed() {
                    let reason = status.state.stop_reason.clone().unwrap_or_default();
                    return Err(ScrapflyError::CrawlerFailed(crate::error::ApiError {
                        message: format!("crawl failed (stop_reason={})", reason),
                        ..Default::default()
                    }));
                }
                if status.is_cancelled() {
                    if opts.allow_cancelled {
                        return Ok(());
                    }
                    return Err(ScrapflyError::CrawlerCancelled);
                }
                return Ok(());
            }
            // Time out early if the next sleep would overshoot the deadline.
            if let Some(d) = deadline {
                if Instant::now() + opts.poll_interval > d {
                    return Err(ScrapflyError::CrawlerTimeout);
                }
            }
            tokio::time::sleep(opts.poll_interval).await;
        }
    }

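    /// Ask the API to cancel this crawl.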
    pub async fn cancel(&self) -> Result<(), ScrapflyError> {
        let uuid = self.uuid_required()?;
        self.client.crawl_cancel(uuid).await
    }

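    /// List this crawl's URLs, optionally filtered by status, one page at a time.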
    pub async fn urls(
        &self,
        status_filter: Option<&str>,
        page: u32,
        per_page: u32,
    ) -> Result<CrawlerUrls, ScrapflyError> {
        let uuid = self.uuid_required()?;
        self.client
            .crawl_urls(uuid, status_filter, page, per_page)
            .await
    }

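    /// Fetch the crawled content of `target_url` in the given format.
    /// Returns `Ok(None)` if the API reports 404 for that URL.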
    pub async fn read(
        &self,
        target_url: &str,
        format: CrawlerContentFormat,
    ) -> Result<Option<CrawlContent>, ScrapflyError> {
        let uuid = self.uuid_required()?.to_string();
        match self
            .client
            .crawl_contents_plain(&uuid, target_url, format)
            .await
        {
            Ok(content) => Ok(Some(CrawlContent {
                url: target_url.to_string(),
                content,
                crawl_uuid: uuid,
            })),
            // A 404 means the URL was not part of the crawl results.
            Err(ScrapflyError::ApiClient(e)) if e.http_status == 404 => Ok(None),
            Err(ScrapflyError::CrawlerFailed(e)) if e.http_status == 404 => Ok(None),
            Err(e) => Err(e),
        }
    }

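    /// Like [`Crawl::read`], but yields an empty string when the URL is absent.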
    pub async fn read_string(
        &self,
        target_url: &str,
        format: CrawlerContentFormat,
    ) -> Result<String, ScrapflyError> {
        Ok(self
            .read(target_url, format)
            .await?
            .map(|c| c.content)
            .unwrap_or_default())
    }

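    /// Fetch contents for several URLs and formats in a single batch request.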
    pub async fn read_batch(
        &self,
        urls: &[String],
        formats: &[CrawlerContentFormat],
    ) -> Result<
        std::collections::BTreeMap<String, std::collections::BTreeMap<String, String>>,
        ScrapflyError,
    > {
        let uuid = self.uuid_required()?;
        self.client.crawl_contents_batch(uuid, urls, formats).await
    }

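    /// List crawled contents in the given format, with optional `limit`/`offset` paging.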
    pub async fn contents(
        &self,
        format: CrawlerContentFormat,
        limit: Option<u32>,
        offset: Option<u32>,
    ) -> Result<CrawlerContents, ScrapflyError> {
        let uuid = self.uuid_required()?;
        self.client
            .crawl_contents_json(uuid, format, limit, offset)
            .await
    }

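    /// The crawl's WARC artifact, fetched once and then served from cache.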
    pub async fn warc(&mut self) -> Result<&CrawlerArtifact, ScrapflyError> {
        let uuid = self.uuid_required()?.to_string();
        if self.cached_warc.is_none() {
            let a = self
                .client
                .crawl_artifact(&uuid, CrawlerArtifactType::Warc)
                .await?;
            self.cached_warc = Some(a);
        }
        match &self.cached_warc {
            Some(a) => Ok(a),
            None => Err(ScrapflyError::Config(
                "warc cache unexpectedly empty".into(),
            )),
        }
    }

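    /// The crawl's HAR artifact, fetched once and then served from cache.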
    pub async fn har(&mut self) -> Result<&CrawlerArtifact, ScrapflyError> {
        let uuid = self.uuid_required()?.to_string();
        if self.cached_har.is_none() {
            let a = self
                .client
                .crawl_artifact(&uuid, CrawlerArtifactType::Har)
                .await?;
            self.cached_har = Some(a);
        }
        match &self.cached_har {
            Some(a) => Ok(a),
            None => Err(ScrapflyError::Config("har cache unexpectedly empty".into())),
        }
    }
}