Skip to main content

scrapfly_sdk/config/
scrape.rs

1//! Scrape endpoint configuration — ported from `sdk/go/config_scrape.go`.
2
3use std::collections::BTreeMap;
4
5use crate::enums::{ExtractionModel, Format, FormatOption, HttpMethod, ProxyPool, ScreenshotFlag};
6use crate::error::ScrapflyError;
7
8use super::url_safe_b64_encode;
9
10/// Configuration for a single `POST /scrape` request.
11///
12/// Construct via [`ScrapeConfig::builder`].
13#[derive(Debug, Clone, Default)]
14pub struct ScrapeConfig {
15    /// Target URL (required).
16    pub url: String,
17    /// HTTP method; defaults to `GET`.
18    pub method: Option<HttpMethod>,
19    /// Raw request body.
20    pub body: Option<String>,
21    /// Request headers (sent as `headers[key]=value`).
22    pub headers: BTreeMap<String, String>,
23    /// Cookies (merged into `headers[cookie]`).
24    pub cookies: BTreeMap<String, String>,
25    /// Proxy country.
26    pub country: Option<String>,
27    /// Proxy pool.
28    pub proxy_pool: Option<ProxyPool>,
29    /// Enable JavaScript rendering.
30    pub render_js: bool,
31    /// Enable Anti-Scraping Protection bypass.
32    pub asp: bool,
33    /// Enable cache.
34    pub cache: bool,
35    /// Cache TTL (seconds).
36    pub cache_ttl: Option<u32>,
37    /// Force cache refresh.
38    pub cache_clear: bool,
39    /// Timeout in milliseconds.
40    pub timeout: Option<u32>,
41    /// Maximum API credit cost the caller is willing to spend on this
42    /// request. If the server's pre-flight estimate exceeds this budget the
43    /// request is rejected before execution.
44    pub cost_budget: Option<u32>,
45    /// Enable automatic retries.
46    pub retry: Option<bool>,
47    /// Session name.
48    pub session: Option<String>,
49    /// Sticky-proxy inside the session.
50    pub session_sticky_proxy: bool,
51    /// Custom tags.
52    pub tags: Vec<String>,
53    /// Webhook name.
54    pub webhook: Option<String>,
55    /// Debug mode.
56    pub debug: bool,
57    /// Capture SSL details.
58    pub ssl: bool,
59    /// Capture DNS details.
60    pub dns: bool,
61    /// Correlation ID.
62    pub correlation_id: Option<String>,
63    /// Output format.
64    pub format: Option<Format>,
65    /// Format options.
66    pub format_options: Vec<FormatOption>,
67    /// Saved extraction template name.
68    pub extraction_template: Option<String>,
69    /// Inline (ephemeral) extraction template as JSON value.
70    pub extraction_ephemeral_template: Option<serde_json::Value>,
71    /// AI extraction prompt.
72    pub extraction_prompt: Option<String>,
73    /// Extraction model.
74    pub extraction_model: Option<ExtractionModel>,
75    /// Wait for CSS selector (requires `render_js`).
76    pub wait_for_selector: Option<String>,
77    /// Extra wait after page load, milliseconds.
78    pub rendering_wait: Option<u32>,
79    /// Auto-scroll to load lazy content.
80    pub auto_scroll: bool,
81    /// Named screenshots (name → selector, or "fullpage").
82    pub screenshots: BTreeMap<String, String>,
83    /// Screenshot flags.
84    pub screenshot_flags: Vec<ScreenshotFlag>,
85    /// Inline JavaScript code (base64url-encoded on the wire).
86    pub js: Option<String>,
87    /// JS scenario (serialized as JSON then base64url-encoded).
88    pub js_scenario: Option<serde_json::Value>,
89    /// OS fingerprint hint.
90    pub os: Option<String>,
91    /// Accept-Language values.
92    pub lang: Vec<String>,
93    /// Browser brand (`chrome` | `edge` | `brave` | `opera`).
94    pub browser_brand: Option<String>,
95    /// Spoof browser geolocation. Format: `"latitude,longitude"`.
96    pub geolocation: Option<String>,
97    /// Page load stage to wait for. `complete` (default) or `domcontentloaded`.
98    pub rendering_stage: Option<String>,
99    /// Return the raw upstream response instead of the JSON envelope.
100    /// When true, callers must use `Client::scrape_proxified()` which
101    /// returns `reqwest::Response` directly.
102    pub proxified_response: bool,
103}
104
105impl ScrapeConfig {
106    /// Start a builder for `url`.
107    pub fn builder(url: impl Into<String>) -> ScrapeConfigBuilder {
108        ScrapeConfigBuilder {
109            cfg: ScrapeConfig {
110                url: url.into(),
111                ..Default::default()
112            },
113        }
114    }
115
116    /// Serialize the config into the query-parameter pairs that the
117    /// `/scrape` endpoint expects. Mirrors
118    /// `sdk/go/config_scrape.go::toAPIParamsWithValidation`.
119    pub fn to_query_pairs(&self) -> Result<Vec<(String, String)>, ScrapflyError> {
120        if self.url.is_empty() {
121            return Err(ScrapflyError::Config("url is required".into()));
122        }
123
124        let mut out: Vec<(String, String)> = Vec::new();
125        out.push(("url".into(), self.url.clone()));
126
127        if let Some(country) = &self.country {
128            out.push(("country".into(), country.to_lowercase()));
129        }
130        if let Some(pool) = &self.proxy_pool {
131            out.push(("proxy_pool".into(), pool.as_str().into()));
132        }
133
134        if self.render_js {
135            out.push(("render_js".into(), "true".into()));
136            if let Some(sel) = &self.wait_for_selector {
137                out.push(("wait_for_selector".into(), sel.clone()));
138            }
139            if let Some(wait) = self.rendering_wait {
140                out.push(("rendering_wait".into(), wait.to_string()));
141            }
142            if let Some(ref geo) = self.geolocation {
143                out.push(("geolocation".into(), geo.clone()));
144            }
145            if let Some(ref stage) = self.rendering_stage {
146                if stage != "complete" {
147                    out.push(("rendering_stage".into(), stage.clone()));
148                }
149            }
150            if self.auto_scroll {
151                out.push(("auto_scroll".into(), "true".into()));
152            }
153            if let Some(js) = &self.js {
154                out.push(("js".into(), url_safe_b64_encode(js)));
155            }
156            if let Some(sc) = &self.js_scenario {
157                let as_str = serde_json::to_string(sc)?;
158                out.push(("js_scenario".into(), url_safe_b64_encode(&as_str)));
159            }
160            for (name, value) in &self.screenshots {
161                if value.is_empty() {
162                    return Err(ScrapflyError::Config(format!(
163                        "screenshots[{}] requires either a selector or 'fullpage'",
164                        name
165                    )));
166                }
167                out.push((format!("screenshots[{}]", name), value.clone()));
168            }
169            if !self.screenshot_flags.is_empty() {
170                let joined = self
171                    .screenshot_flags
172                    .iter()
173                    .map(|f| f.as_str())
174                    .collect::<Vec<_>>()
175                    .join(",");
176                out.push(("screenshot_flags".into(), joined));
177            }
178        }
179
180        if self.asp {
181            out.push(("asp".into(), "true".into()));
182        }
183        if self.retry == Some(false) {
184            out.push(("retry".into(), "false".into()));
185        }
186        if self.cache {
187            out.push(("cache".into(), "true".into()));
188            if let Some(ttl) = self.cache_ttl {
189                out.push(("cache_ttl".into(), ttl.to_string()));
190            }
191            if self.cache_clear {
192                out.push(("cache_clear".into(), "true".into()));
193            }
194        }
195        if let Some(timeout) = self.timeout {
196            out.push(("timeout".into(), timeout.to_string()));
197        }
198        if let Some(budget) = self.cost_budget {
199            out.push(("cost_budget".into(), budget.to_string()));
200        }
201        if self.debug {
202            out.push(("debug".into(), "true".into()));
203        }
204        if self.ssl {
205            out.push(("ssl".into(), "true".into()));
206        }
207        if self.dns {
208            out.push(("dns".into(), "true".into()));
209        }
210        if let Some(cid) = &self.correlation_id {
211            out.push(("correlation_id".into(), cid.clone()));
212        }
213        if !self.tags.is_empty() {
214            out.push(("tags".into(), self.tags.join(",")));
215        }
216        if let Some(wh) = &self.webhook {
217            out.push(("webhook_name".into(), wh.clone()));
218        }
219        if let Some(session) = &self.session {
220            out.push(("session".into(), session.clone()));
221            if self.session_sticky_proxy {
222                out.push(("session_sticky_proxy".into(), "true".into()));
223            }
224        }
225        if let Some(os) = &self.os {
226            out.push(("os".into(), os.clone()));
227        }
228        if !self.lang.is_empty() {
229            out.push(("lang".into(), self.lang.join(",")));
230        }
231        if let Some(bb) = &self.browser_brand {
232            out.push(("browser_brand".into(), bb.clone()));
233        }
234        if self.proxified_response {
235            out.push(("proxified_response".into(), "true".into()));
236        }
237
238        if let Some(format) = self.format {
239            let mut val = format.as_str().to_string();
240            if !self.format_options.is_empty() {
241                val.push(':');
242                val.push_str(
243                    &self
244                        .format_options
245                        .iter()
246                        .map(|f| f.as_str())
247                        .collect::<Vec<_>>()
248                        .join(","),
249                );
250            }
251            out.push(("format".into(), val));
252        }
253
254        // Extraction — exclusivity enforced by builder.
255        if let Some(tpl) = &self.extraction_template {
256            out.push(("extraction_template".into(), tpl.clone()));
257        } else if let Some(tpl) = &self.extraction_ephemeral_template {
258            let as_str = serde_json::to_string(tpl)?;
259            out.push((
260                "extraction_template".into(),
261                format!("ephemeral:{}", url_safe_b64_encode(&as_str)),
262            ));
263        } else if let Some(prompt) = &self.extraction_prompt {
264            out.push(("extraction_prompt".into(), prompt.clone()));
265        } else if let Some(model) = self.extraction_model {
266            out.push(("extraction_model".into(), model.as_str().into()));
267        }
268
269        // Headers → `headers[key]=value`.
270        for (k, v) in &self.headers {
271            if k.is_empty() || v.is_empty() {
272                return Err(ScrapflyError::Config(
273                    "headers key and value cannot be empty".into(),
274                ));
275            }
276            out.push((format!("headers[{}]", k.to_lowercase()), v.clone()));
277        }
278
279        // Cookies merged into `headers[cookie]`.
280        if !self.cookies.is_empty() {
281            let cookie_str = self
282                .cookies
283                .iter()
284                .map(|(k, v)| format!("{}={}", k, v))
285                .collect::<Vec<_>>()
286                .join("; ");
287            // Check for any existing `headers[cookie]` from above.
288            let existing = out
289                .iter()
290                .position(|(k, _)| k.eq_ignore_ascii_case("headers[cookie]"));
291            match existing {
292                Some(idx) => {
293                    let existing_val = out[idx].1.clone();
294                    out[idx].1 = format!("{}; {}", existing_val, cookie_str);
295                }
296                None => out.push(("headers[cookie]".into(), cookie_str)),
297            }
298        }
299
300        Ok(out)
301    }
302}
303
304/// Builder for [`ScrapeConfig`].
305#[derive(Debug, Clone)]
306pub struct ScrapeConfigBuilder {
307    cfg: ScrapeConfig,
308}
309
310impl ScrapeConfigBuilder {
311    /// Set HTTP method.
312    pub fn method(mut self, m: HttpMethod) -> Self {
313        self.cfg.method = Some(m);
314        self
315    }
316    /// Set raw request body.
317    pub fn body(mut self, b: impl Into<String>) -> Self {
318        self.cfg.body = Some(b.into());
319        self
320    }
321    /// Set a custom header.
322    pub fn header(mut self, k: impl Into<String>, v: impl Into<String>) -> Self {
323        self.cfg.headers.insert(k.into(), v.into());
324        self
325    }
326    /// Replace the full header map.
327    pub fn headers(mut self, headers: BTreeMap<String, String>) -> Self {
328        self.cfg.headers = headers;
329        self
330    }
331    /// Set a cookie.
332    pub fn cookie(mut self, k: impl Into<String>, v: impl Into<String>) -> Self {
333        self.cfg.cookies.insert(k.into(), v.into());
334        self
335    }
336    /// Set proxy country.
337    pub fn country(mut self, c: impl Into<String>) -> Self {
338        self.cfg.country = Some(c.into());
339        self
340    }
341    /// Set proxy pool.
342    pub fn proxy_pool(mut self, p: ProxyPool) -> Self {
343        self.cfg.proxy_pool = Some(p);
344        self
345    }
346    /// Enable JS rendering.
347    pub fn render_js(mut self, v: bool) -> Self {
348        self.cfg.render_js = v;
349        self
350    }
351    /// Enable ASP bypass.
352    pub fn asp(mut self, v: bool) -> Self {
353        self.cfg.asp = v;
354        self
355    }
356    /// Enable cache.
357    pub fn cache(mut self, v: bool) -> Self {
358        self.cfg.cache = v;
359        self
360    }
361    /// Set cache TTL.
362    pub fn cache_ttl(mut self, v: u32) -> Self {
363        self.cfg.cache_ttl = Some(v);
364        self
365    }
366    /// Force cache refresh.
367    pub fn cache_clear(mut self, v: bool) -> Self {
368        self.cfg.cache_clear = v;
369        self
370    }
371    /// Set the maximum API credit cost the caller will accept for this
372    /// request. The server rejects the request pre-flight if its estimate
373    /// exceeds the budget, so callers get a fast failure instead of a
374    /// surprise bill.
375    pub fn cost_budget(mut self, v: u32) -> Self {
376        self.cfg.cost_budget = Some(v);
377        self
378    }
379    /// Set request timeout (ms).
380    pub fn timeout(mut self, v: u32) -> Self {
381        self.cfg.timeout = Some(v);
382        self
383    }
384    /// Set automatic retry flag.
385    pub fn retry(mut self, v: bool) -> Self {
386        self.cfg.retry = Some(v);
387        self
388    }
389    /// Set session name.
390    pub fn session(mut self, v: impl Into<String>) -> Self {
391        self.cfg.session = Some(v.into());
392        self
393    }
394    /// Sticky proxy in a session.
395    pub fn session_sticky_proxy(mut self, v: bool) -> Self {
396        self.cfg.session_sticky_proxy = v;
397        self
398    }
399    /// Add a tag.
400    pub fn tag(mut self, v: impl Into<String>) -> Self {
401        self.cfg.tags.push(v.into());
402        self
403    }
404    /// Set all tags.
405    pub fn tags(mut self, v: Vec<String>) -> Self {
406        self.cfg.tags = v;
407        self
408    }
409    /// Set webhook name.
410    pub fn webhook(mut self, v: impl Into<String>) -> Self {
411        self.cfg.webhook = Some(v.into());
412        self
413    }
414    /// Enable debug mode.
415    pub fn debug(mut self, v: bool) -> Self {
416        self.cfg.debug = v;
417        self
418    }
419    /// Capture SSL details.
420    pub fn ssl(mut self, v: bool) -> Self {
421        self.cfg.ssl = v;
422        self
423    }
424    /// Capture DNS details.
425    pub fn dns(mut self, v: bool) -> Self {
426        self.cfg.dns = v;
427        self
428    }
429    /// Set correlation ID.
430    pub fn correlation_id(mut self, v: impl Into<String>) -> Self {
431        self.cfg.correlation_id = Some(v.into());
432        self
433    }
434    /// Set output format.
435    pub fn format(mut self, v: Format) -> Self {
436        self.cfg.format = Some(v);
437        self
438    }
439    /// Add a format option.
440    pub fn format_option(mut self, v: FormatOption) -> Self {
441        self.cfg.format_options.push(v);
442        self
443    }
444    /// Set saved extraction template name.
445    pub fn extraction_template(mut self, v: impl Into<String>) -> Self {
446        self.cfg.extraction_template = Some(v.into());
447        self
448    }
449    /// Set inline extraction template.
450    pub fn extraction_ephemeral_template(mut self, v: serde_json::Value) -> Self {
451        self.cfg.extraction_ephemeral_template = Some(v);
452        self
453    }
454    /// Set AI extraction prompt.
455    pub fn extraction_prompt(mut self, v: impl Into<String>) -> Self {
456        self.cfg.extraction_prompt = Some(v.into());
457        self
458    }
459    /// Set extraction model.
460    pub fn extraction_model(mut self, v: ExtractionModel) -> Self {
461        self.cfg.extraction_model = Some(v);
462        self
463    }
464    /// Set wait-for-selector.
465    pub fn wait_for_selector(mut self, v: impl Into<String>) -> Self {
466        self.cfg.wait_for_selector = Some(v.into());
467        self
468    }
469    /// Set extra rendering wait (ms).
470    pub fn rendering_wait(mut self, v: u32) -> Self {
471        self.cfg.rendering_wait = Some(v);
472        self
473    }
474    /// Enable auto-scroll.
475    pub fn auto_scroll(mut self, v: bool) -> Self {
476        self.cfg.auto_scroll = v;
477        self
478    }
479    /// Add a named screenshot.
480    pub fn screenshot(mut self, name: impl Into<String>, selector: impl Into<String>) -> Self {
481        self.cfg.screenshots.insert(name.into(), selector.into());
482        self
483    }
484    /// Add a screenshot flag.
485    pub fn screenshot_flag(mut self, v: ScreenshotFlag) -> Self {
486        self.cfg.screenshot_flags.push(v);
487        self
488    }
489    /// Set inline JS code.
490    pub fn js(mut self, v: impl Into<String>) -> Self {
491        self.cfg.js = Some(v.into());
492        self
493    }
494    /// Set JS scenario (as serde_json::Value).
495    pub fn js_scenario(mut self, v: serde_json::Value) -> Self {
496        self.cfg.js_scenario = Some(v);
497        self
498    }
499    /// Set OS fingerprint hint.
500    pub fn os(mut self, v: impl Into<String>) -> Self {
501        self.cfg.os = Some(v.into());
502        self
503    }
504    /// Add an Accept-Language value.
505    pub fn lang(mut self, v: impl Into<String>) -> Self {
506        self.cfg.lang.push(v.into());
507        self
508    }
509    /// Set browser brand.
510    pub fn browser_brand(mut self, v: impl Into<String>) -> Self {
511        self.cfg.browser_brand = Some(v.into());
512        self
513    }
514    /// Enable proxified response mode (raw upstream pass-through).
515    /// Spoof browser geolocation. Format: `"latitude,longitude"`.
516    pub fn geolocation(mut self, v: impl Into<String>) -> Self {
517        self.cfg.geolocation = Some(v.into());
518        self
519    }
520    /// Set page load stage: `"complete"` (default) or `"domcontentloaded"`.
521    pub fn rendering_stage(mut self, v: impl Into<String>) -> Self {
522        self.cfg.rendering_stage = Some(v.into());
523        self
524    }
525    /// Enable proxified response mode (raw upstream pass-through).
526    pub fn proxified_response(mut self) -> Self {
527        self.cfg.proxified_response = true;
528        self
529    }
530
531    /// Finalize the builder, enforcing the mutual-exclusion rules for the
532    /// extraction fields.
533    pub fn build(self) -> Result<ScrapeConfig, ScrapflyError> {
534        let cfg = self.cfg;
535        let count = [
536            cfg.extraction_template.is_some(),
537            cfg.extraction_ephemeral_template.is_some(),
538            cfg.extraction_prompt.is_some(),
539            cfg.extraction_model.is_some(),
540        ]
541        .iter()
542        .filter(|x| **x)
543        .count();
544        if count > 1 {
545            return Err(ScrapflyError::Config(
546                "extraction_template, extraction_ephemeral_template, extraction_prompt and extraction_model are mutually exclusive"
547                    .into(),
548            ));
549        }
550        Ok(cfg)
551    }
552}