1use std::collections::BTreeMap;
4
5use crate::enums::{ExtractionModel, Format, FormatOption, HttpMethod, ProxyPool, ScreenshotFlag};
6use crate::error::ScrapflyError;
7
8use super::url_safe_b64_encode;
9
10#[derive(Debug, Clone, Default)]
14pub struct ScrapeConfig {
15 pub url: String,
17 pub method: Option<HttpMethod>,
19 pub body: Option<String>,
21 pub headers: BTreeMap<String, String>,
23 pub cookies: BTreeMap<String, String>,
25 pub country: Option<String>,
27 pub proxy_pool: Option<ProxyPool>,
29 pub render_js: bool,
31 pub asp: bool,
33 pub cache: bool,
35 pub cache_ttl: Option<u32>,
37 pub cache_clear: bool,
39 pub timeout: Option<u32>,
41 pub cost_budget: Option<u32>,
45 pub retry: Option<bool>,
47 pub session: Option<String>,
49 pub session_sticky_proxy: bool,
51 pub tags: Vec<String>,
53 pub webhook: Option<String>,
55 pub debug: bool,
57 pub ssl: bool,
59 pub dns: bool,
61 pub correlation_id: Option<String>,
63 pub format: Option<Format>,
65 pub format_options: Vec<FormatOption>,
67 pub extraction_template: Option<String>,
69 pub extraction_ephemeral_template: Option<serde_json::Value>,
71 pub extraction_prompt: Option<String>,
73 pub extraction_model: Option<ExtractionModel>,
75 pub wait_for_selector: Option<String>,
77 pub rendering_wait: Option<u32>,
79 pub auto_scroll: bool,
81 pub screenshots: BTreeMap<String, String>,
83 pub screenshot_flags: Vec<ScreenshotFlag>,
85 pub js: Option<String>,
87 pub js_scenario: Option<serde_json::Value>,
89 pub os: Option<String>,
91 pub lang: Vec<String>,
93 pub browser_brand: Option<String>,
95 pub geolocation: Option<String>,
97 pub rendering_stage: Option<String>,
99 pub proxified_response: bool,
103}
104
105impl ScrapeConfig {
106 pub fn builder(url: impl Into<String>) -> ScrapeConfigBuilder {
108 ScrapeConfigBuilder {
109 cfg: ScrapeConfig {
110 url: url.into(),
111 ..Default::default()
112 },
113 }
114 }
115
116 pub fn to_query_pairs(&self) -> Result<Vec<(String, String)>, ScrapflyError> {
120 if self.url.is_empty() {
121 return Err(ScrapflyError::Config("url is required".into()));
122 }
123
124 let mut out: Vec<(String, String)> = Vec::new();
125 out.push(("url".into(), self.url.clone()));
126
127 if let Some(country) = &self.country {
128 out.push(("country".into(), country.to_lowercase()));
129 }
130 if let Some(pool) = &self.proxy_pool {
131 out.push(("proxy_pool".into(), pool.as_str().into()));
132 }
133
134 if self.render_js {
135 out.push(("render_js".into(), "true".into()));
136 if let Some(sel) = &self.wait_for_selector {
137 out.push(("wait_for_selector".into(), sel.clone()));
138 }
139 if let Some(wait) = self.rendering_wait {
140 out.push(("rendering_wait".into(), wait.to_string()));
141 }
142 if let Some(ref geo) = self.geolocation {
143 out.push(("geolocation".into(), geo.clone()));
144 }
145 if let Some(ref stage) = self.rendering_stage {
146 if stage != "complete" {
147 out.push(("rendering_stage".into(), stage.clone()));
148 }
149 }
150 if self.auto_scroll {
151 out.push(("auto_scroll".into(), "true".into()));
152 }
153 if let Some(js) = &self.js {
154 out.push(("js".into(), url_safe_b64_encode(js)));
155 }
156 if let Some(sc) = &self.js_scenario {
157 let as_str = serde_json::to_string(sc)?;
158 out.push(("js_scenario".into(), url_safe_b64_encode(&as_str)));
159 }
160 for (name, value) in &self.screenshots {
161 if value.is_empty() {
162 return Err(ScrapflyError::Config(format!(
163 "screenshots[{}] requires either a selector or 'fullpage'",
164 name
165 )));
166 }
167 out.push((format!("screenshots[{}]", name), value.clone()));
168 }
169 if !self.screenshot_flags.is_empty() {
170 let joined = self
171 .screenshot_flags
172 .iter()
173 .map(|f| f.as_str())
174 .collect::<Vec<_>>()
175 .join(",");
176 out.push(("screenshot_flags".into(), joined));
177 }
178 }
179
180 if self.asp {
181 out.push(("asp".into(), "true".into()));
182 }
183 if self.retry == Some(false) {
184 out.push(("retry".into(), "false".into()));
185 }
186 if self.cache {
187 out.push(("cache".into(), "true".into()));
188 if let Some(ttl) = self.cache_ttl {
189 out.push(("cache_ttl".into(), ttl.to_string()));
190 }
191 if self.cache_clear {
192 out.push(("cache_clear".into(), "true".into()));
193 }
194 }
195 if let Some(timeout) = self.timeout {
196 out.push(("timeout".into(), timeout.to_string()));
197 }
198 if let Some(budget) = self.cost_budget {
199 out.push(("cost_budget".into(), budget.to_string()));
200 }
201 if self.debug {
202 out.push(("debug".into(), "true".into()));
203 }
204 if self.ssl {
205 out.push(("ssl".into(), "true".into()));
206 }
207 if self.dns {
208 out.push(("dns".into(), "true".into()));
209 }
210 if let Some(cid) = &self.correlation_id {
211 out.push(("correlation_id".into(), cid.clone()));
212 }
213 if !self.tags.is_empty() {
214 out.push(("tags".into(), self.tags.join(",")));
215 }
216 if let Some(wh) = &self.webhook {
217 out.push(("webhook_name".into(), wh.clone()));
218 }
219 if let Some(session) = &self.session {
220 out.push(("session".into(), session.clone()));
221 if self.session_sticky_proxy {
222 out.push(("session_sticky_proxy".into(), "true".into()));
223 }
224 }
225 if let Some(os) = &self.os {
226 out.push(("os".into(), os.clone()));
227 }
228 if !self.lang.is_empty() {
229 out.push(("lang".into(), self.lang.join(",")));
230 }
231 if let Some(bb) = &self.browser_brand {
232 out.push(("browser_brand".into(), bb.clone()));
233 }
234 if self.proxified_response {
235 out.push(("proxified_response".into(), "true".into()));
236 }
237
238 if let Some(format) = self.format {
239 let mut val = format.as_str().to_string();
240 if !self.format_options.is_empty() {
241 val.push(':');
242 val.push_str(
243 &self
244 .format_options
245 .iter()
246 .map(|f| f.as_str())
247 .collect::<Vec<_>>()
248 .join(","),
249 );
250 }
251 out.push(("format".into(), val));
252 }
253
254 if let Some(tpl) = &self.extraction_template {
256 out.push(("extraction_template".into(), tpl.clone()));
257 } else if let Some(tpl) = &self.extraction_ephemeral_template {
258 let as_str = serde_json::to_string(tpl)?;
259 out.push((
260 "extraction_template".into(),
261 format!("ephemeral:{}", url_safe_b64_encode(&as_str)),
262 ));
263 } else if let Some(prompt) = &self.extraction_prompt {
264 out.push(("extraction_prompt".into(), prompt.clone()));
265 } else if let Some(model) = self.extraction_model {
266 out.push(("extraction_model".into(), model.as_str().into()));
267 }
268
269 for (k, v) in &self.headers {
271 if k.is_empty() || v.is_empty() {
272 return Err(ScrapflyError::Config(
273 "headers key and value cannot be empty".into(),
274 ));
275 }
276 out.push((format!("headers[{}]", k.to_lowercase()), v.clone()));
277 }
278
279 if !self.cookies.is_empty() {
281 let cookie_str = self
282 .cookies
283 .iter()
284 .map(|(k, v)| format!("{}={}", k, v))
285 .collect::<Vec<_>>()
286 .join("; ");
287 let existing = out
289 .iter()
290 .position(|(k, _)| k.eq_ignore_ascii_case("headers[cookie]"));
291 match existing {
292 Some(idx) => {
293 let existing_val = out[idx].1.clone();
294 out[idx].1 = format!("{}; {}", existing_val, cookie_str);
295 }
296 None => out.push(("headers[cookie]".into(), cookie_str)),
297 }
298 }
299
300 Ok(out)
301 }
302}
303
304#[derive(Debug, Clone)]
306pub struct ScrapeConfigBuilder {
307 cfg: ScrapeConfig,
308}
309
310impl ScrapeConfigBuilder {
311 pub fn method(mut self, m: HttpMethod) -> Self {
313 self.cfg.method = Some(m);
314 self
315 }
316 pub fn body(mut self, b: impl Into<String>) -> Self {
318 self.cfg.body = Some(b.into());
319 self
320 }
321 pub fn header(mut self, k: impl Into<String>, v: impl Into<String>) -> Self {
323 self.cfg.headers.insert(k.into(), v.into());
324 self
325 }
326 pub fn headers(mut self, headers: BTreeMap<String, String>) -> Self {
328 self.cfg.headers = headers;
329 self
330 }
331 pub fn cookie(mut self, k: impl Into<String>, v: impl Into<String>) -> Self {
333 self.cfg.cookies.insert(k.into(), v.into());
334 self
335 }
336 pub fn country(mut self, c: impl Into<String>) -> Self {
338 self.cfg.country = Some(c.into());
339 self
340 }
341 pub fn proxy_pool(mut self, p: ProxyPool) -> Self {
343 self.cfg.proxy_pool = Some(p);
344 self
345 }
346 pub fn render_js(mut self, v: bool) -> Self {
348 self.cfg.render_js = v;
349 self
350 }
351 pub fn asp(mut self, v: bool) -> Self {
353 self.cfg.asp = v;
354 self
355 }
356 pub fn cache(mut self, v: bool) -> Self {
358 self.cfg.cache = v;
359 self
360 }
361 pub fn cache_ttl(mut self, v: u32) -> Self {
363 self.cfg.cache_ttl = Some(v);
364 self
365 }
366 pub fn cache_clear(mut self, v: bool) -> Self {
368 self.cfg.cache_clear = v;
369 self
370 }
371 pub fn cost_budget(mut self, v: u32) -> Self {
376 self.cfg.cost_budget = Some(v);
377 self
378 }
379 pub fn timeout(mut self, v: u32) -> Self {
381 self.cfg.timeout = Some(v);
382 self
383 }
384 pub fn retry(mut self, v: bool) -> Self {
386 self.cfg.retry = Some(v);
387 self
388 }
389 pub fn session(mut self, v: impl Into<String>) -> Self {
391 self.cfg.session = Some(v.into());
392 self
393 }
394 pub fn session_sticky_proxy(mut self, v: bool) -> Self {
396 self.cfg.session_sticky_proxy = v;
397 self
398 }
399 pub fn tag(mut self, v: impl Into<String>) -> Self {
401 self.cfg.tags.push(v.into());
402 self
403 }
404 pub fn tags(mut self, v: Vec<String>) -> Self {
406 self.cfg.tags = v;
407 self
408 }
409 pub fn webhook(mut self, v: impl Into<String>) -> Self {
411 self.cfg.webhook = Some(v.into());
412 self
413 }
414 pub fn debug(mut self, v: bool) -> Self {
416 self.cfg.debug = v;
417 self
418 }
419 pub fn ssl(mut self, v: bool) -> Self {
421 self.cfg.ssl = v;
422 self
423 }
424 pub fn dns(mut self, v: bool) -> Self {
426 self.cfg.dns = v;
427 self
428 }
429 pub fn correlation_id(mut self, v: impl Into<String>) -> Self {
431 self.cfg.correlation_id = Some(v.into());
432 self
433 }
434 pub fn format(mut self, v: Format) -> Self {
436 self.cfg.format = Some(v);
437 self
438 }
439 pub fn format_option(mut self, v: FormatOption) -> Self {
441 self.cfg.format_options.push(v);
442 self
443 }
444 pub fn extraction_template(mut self, v: impl Into<String>) -> Self {
446 self.cfg.extraction_template = Some(v.into());
447 self
448 }
449 pub fn extraction_ephemeral_template(mut self, v: serde_json::Value) -> Self {
451 self.cfg.extraction_ephemeral_template = Some(v);
452 self
453 }
454 pub fn extraction_prompt(mut self, v: impl Into<String>) -> Self {
456 self.cfg.extraction_prompt = Some(v.into());
457 self
458 }
459 pub fn extraction_model(mut self, v: ExtractionModel) -> Self {
461 self.cfg.extraction_model = Some(v);
462 self
463 }
464 pub fn wait_for_selector(mut self, v: impl Into<String>) -> Self {
466 self.cfg.wait_for_selector = Some(v.into());
467 self
468 }
469 pub fn rendering_wait(mut self, v: u32) -> Self {
471 self.cfg.rendering_wait = Some(v);
472 self
473 }
474 pub fn auto_scroll(mut self, v: bool) -> Self {
476 self.cfg.auto_scroll = v;
477 self
478 }
479 pub fn screenshot(mut self, name: impl Into<String>, selector: impl Into<String>) -> Self {
481 self.cfg.screenshots.insert(name.into(), selector.into());
482 self
483 }
484 pub fn screenshot_flag(mut self, v: ScreenshotFlag) -> Self {
486 self.cfg.screenshot_flags.push(v);
487 self
488 }
489 pub fn js(mut self, v: impl Into<String>) -> Self {
491 self.cfg.js = Some(v.into());
492 self
493 }
494 pub fn js_scenario(mut self, v: serde_json::Value) -> Self {
496 self.cfg.js_scenario = Some(v);
497 self
498 }
499 pub fn os(mut self, v: impl Into<String>) -> Self {
501 self.cfg.os = Some(v.into());
502 self
503 }
504 pub fn lang(mut self, v: impl Into<String>) -> Self {
506 self.cfg.lang.push(v.into());
507 self
508 }
509 pub fn browser_brand(mut self, v: impl Into<String>) -> Self {
511 self.cfg.browser_brand = Some(v.into());
512 self
513 }
514 pub fn geolocation(mut self, v: impl Into<String>) -> Self {
517 self.cfg.geolocation = Some(v.into());
518 self
519 }
520 pub fn rendering_stage(mut self, v: impl Into<String>) -> Self {
522 self.cfg.rendering_stage = Some(v.into());
523 self
524 }
525 pub fn proxified_response(mut self) -> Self {
527 self.cfg.proxified_response = true;
528 self
529 }
530
531 pub fn build(self) -> Result<ScrapeConfig, ScrapflyError> {
534 let cfg = self.cfg;
535 let count = [
536 cfg.extraction_template.is_some(),
537 cfg.extraction_ephemeral_template.is_some(),
538 cfg.extraction_prompt.is_some(),
539 cfg.extraction_model.is_some(),
540 ]
541 .iter()
542 .filter(|x| **x)
543 .count();
544 if count > 1 {
545 return Err(ScrapflyError::Config(
546 "extraction_template, extraction_ephemeral_template, extraction_prompt and extraction_model are mutually exclusive"
547 .into(),
548 ));
549 }
550 Ok(cfg)
551 }
552}