Skip to main content

scrapfly_sdk/config/
extraction.rs

1//! Extraction endpoint configuration — ported from `sdk/go/config_extraction.go`.
2
3use crate::enums::{CompressionFormat, ExtractionModel};
4use crate::error::ScrapflyError;
5
6use super::url_safe_b64_encode;
7
8/// Configuration for a `POST /extraction` request.
9#[derive(Debug, Clone, Default)]
10pub struct ExtractionConfig {
11    /// Document bytes (required).
12    pub body: Vec<u8>,
13    /// Content type, e.g. `text/html` (required).
14    pub content_type: String,
15    /// Original URL (helps the AI with context).
16    pub url: Option<String>,
17    /// Character set.
18    pub charset: Option<String>,
19    /// Saved extraction template name.
20    pub extraction_template: Option<String>,
21    /// Inline (ephemeral) template.
22    pub extraction_ephemeral_template: Option<serde_json::Value>,
23    /// AI extraction prompt.
24    pub extraction_prompt: Option<String>,
25    /// Extraction model.
26    pub extraction_model: Option<ExtractionModel>,
27    /// Body is compressed.
28    pub is_document_compressed: bool,
29    /// Compression format.
30    pub document_compression_format: Option<CompressionFormat>,
31    /// Webhook name.
32    pub webhook: Option<String>,
33    /// Maximum time in seconds for extraction processing.
34    pub timeout: Option<u32>,
35}
36
37impl ExtractionConfig {
38    /// Start a builder.
39    pub fn builder(body: Vec<u8>, content_type: impl Into<String>) -> ExtractionConfigBuilder {
40        ExtractionConfigBuilder {
41            cfg: ExtractionConfig {
42                body,
43                content_type: content_type.into(),
44                ..Default::default()
45            },
46        }
47    }
48
49    /// Query params (key is added separately by the client).
50    pub fn to_query_pairs(&self) -> Result<Vec<(String, String)>, ScrapflyError> {
51        if self.body.is_empty() {
52            return Err(ScrapflyError::Config("body is required".into()));
53        }
54        if self.content_type.is_empty() {
55            return Err(ScrapflyError::Config("content_type is required".into()));
56        }
57        let tpl_count = [
58            self.extraction_template.is_some(),
59            self.extraction_ephemeral_template.is_some(),
60        ]
61        .iter()
62        .filter(|x| **x)
63        .count();
64        if tpl_count > 1 {
65            return Err(ScrapflyError::Config(
66                "cannot use both extraction_template and extraction_ephemeral_template".into(),
67            ));
68        }
69
70        let mut out = Vec::new();
71        out.push(("content_type".into(), self.content_type.clone()));
72        if let Some(u) = &self.url {
73            out.push(("url".into(), u.clone()));
74        }
75        if let Some(c) = &self.charset {
76            out.push(("charset".into(), c.clone()));
77        }
78        if let Some(t) = &self.extraction_template {
79            out.push(("extraction_template".into(), t.clone()));
80        }
81        if let Some(t) = &self.extraction_ephemeral_template {
82            let s = serde_json::to_string(t)?;
83            out.push((
84                "extraction_template".into(),
85                format!("ephemeral:{}", url_safe_b64_encode(&s)),
86            ));
87        }
88        if let Some(p) = &self.extraction_prompt {
89            out.push(("extraction_prompt".into(), p.clone()));
90        }
91        if let Some(m) = self.extraction_model {
92            out.push(("extraction_model".into(), m.as_str().into()));
93        }
94        if let Some(wh) = &self.webhook {
95            out.push(("webhook_name".into(), wh.clone()));
96        }
97        if let Some(t) = self.timeout {
98            out.push(("timeout".into(), t.to_string()));
99        }
100        Ok(out)
101    }
102}
103
104/// Builder for [`ExtractionConfig`].
105#[derive(Debug, Clone)]
106pub struct ExtractionConfigBuilder {
107    cfg: ExtractionConfig,
108}
109
110impl ExtractionConfigBuilder {
111    /// Original URL.
112    pub fn url(mut self, v: impl Into<String>) -> Self {
113        self.cfg.url = Some(v.into());
114        self
115    }
116    /// Character set.
117    pub fn charset(mut self, v: impl Into<String>) -> Self {
118        self.cfg.charset = Some(v.into());
119        self
120    }
121    /// Saved template name.
122    pub fn extraction_template(mut self, v: impl Into<String>) -> Self {
123        self.cfg.extraction_template = Some(v.into());
124        self
125    }
126    /// Inline template.
127    pub fn extraction_ephemeral_template(mut self, v: serde_json::Value) -> Self {
128        self.cfg.extraction_ephemeral_template = Some(v);
129        self
130    }
131    /// AI prompt.
132    pub fn extraction_prompt(mut self, v: impl Into<String>) -> Self {
133        self.cfg.extraction_prompt = Some(v.into());
134        self
135    }
136    /// Model.
137    pub fn extraction_model(mut self, v: ExtractionModel) -> Self {
138        self.cfg.extraction_model = Some(v);
139        self
140    }
141    /// Body is compressed.
142    pub fn is_document_compressed(mut self, v: bool) -> Self {
143        self.cfg.is_document_compressed = v;
144        self
145    }
146    /// Compression format.
147    pub fn document_compression_format(mut self, v: CompressionFormat) -> Self {
148        self.cfg.document_compression_format = Some(v);
149        self
150    }
151    /// Webhook name.
152    pub fn timeout(mut self, v: u32) -> Self {
153        self.cfg.timeout = Some(v);
154        self
155    }
156    /// Set webhook name for post-extraction notification.
157    pub fn webhook(mut self, v: impl Into<String>) -> Self {
158        self.cfg.webhook = Some(v.into());
159        self
160    }
161    /// Finalize the builder.
162    pub fn build(self) -> Result<ExtractionConfig, ScrapflyError> {
163        if self.cfg.body.is_empty() {
164            return Err(ScrapflyError::Config("body is required".into()));
165        }
166        if self.cfg.content_type.is_empty() {
167            return Err(ScrapflyError::Config("content_type is required".into()));
168        }
169        Ok(self.cfg)
170    }
171}