// stygian_plugin/domain/extraction.rs

//! Extraction template, request, and result types.
2
3use crate::domain::idempotency::IdempotencyKey;
4use crate::domain::selector::Selector;
5use crate::domain::transformation::Transformation;
6use chrono::{DateTime, Utc};
7use serde::{Deserialize, Serialize};
8use serde_json::Value;
9use std::collections::HashMap;
10
11/// A named region within a template to extract data from
12///
13/// Each region represents a distinct zone on the page with its own
14/// selectors and transformations.
15///
16/// # Example
17///
18/// ```
19/// use stygian_plugin::domain::Region;
20/// use stygian_plugin::domain::Selector;
21///
22/// let region = Region {
23///     name: "product-title".to_string(),
24///     selector: Selector::css(".product-name".to_string()),
25///     schema: serde_json::json!({"type": "string"}),
26///     transformations: vec![],
27/// };
28/// ```
29#[derive(Debug, Clone, Serialize, Deserialize)]
30pub struct Region {
31    /// Region name (e.g., "product-title", "price", "rating")
32    pub name: String,
33
34    /// Primary selector (`CSS` or `XPath`) to locate the element
35    pub selector: Selector,
36
37    /// JSON schema describing the expected output shape
38    pub schema: Value,
39
40    /// Ordered transformations to apply to extracted values
41    pub transformations: Vec<Transformation>,
42}
43
44impl Region {
45    /// Create a new region with minimal configuration
46    pub fn new(name: impl Into<String>, selector: Selector, schema: Value) -> Self {
47        Self {
48            name: name.into(),
49            selector,
50            schema,
51            transformations: vec![],
52        }
53    }
54
55    /// Add a transformation to the pipeline
56    #[must_use]
57    pub fn with_transformation(mut self, transformation: Transformation) -> Self {
58        self.transformations.push(transformation);
59        self
60    }
61
62    /// Validate region configuration
63    pub fn validate(&self) -> crate::Result<()> {
64        if self.name.is_empty() {
65            return Err(crate::error::PluginError::TemplateValidationError(
66                "region name cannot be empty".to_string(),
67            ));
68        }
69        if !self.schema.is_object() {
70            return Err(crate::error::PluginError::TemplateValidationError(format!(
71                "region schema must be a JSON object, got {}",
72                self.schema.get("type").unwrap_or(&Value::Null)
73            )));
74        }
75        // Validate the selector syntax
76        self.selector.validate()?;
77        Ok(())
78    }
79}
80
81/// A reusable extraction template defining how to extract data from a page
82///
83/// Templates combine multiple regions, each with selectors and transformations.
84/// A template is the core unit of plugin configuration and is persisted for reuse.
85///
86/// # Example
87///
88/// ```
89/// use stygian_plugin::domain::{ExtractionTemplate, Region, Selector};
90/// use serde_json::json;
91///
92/// let template = ExtractionTemplate {
93///     id: uuid::Uuid::new_v4(),
94///     name: "Product Listing".to_string(),
95///     description: Some("Extract product cards from a listing page".to_string()),
96///     regions: vec![],
97///     metadata: Default::default(),
98/// };
99/// ```
100#[derive(Debug, Clone, Serialize, Deserialize)]
101pub struct ExtractionTemplate {
102    /// Unique identifier for this template
103    pub id: uuid::Uuid,
104
105    /// User-friendly template name
106    pub name: String,
107
108    /// Optional description
109    pub description: Option<String>,
110
111    /// Regions (named extraction zones) in this template
112    pub regions: Vec<Region>,
113
114    /// Metadata (timestamps, version, etc.)
115    pub metadata: TemplateMetadata,
116}
117
118/// Metadata about a template
119#[derive(Debug, Clone, Serialize, Deserialize)]
120pub struct TemplateMetadata {
121    /// When template was created
122    pub created_at: DateTime<Utc>,
123
124    /// When template was last modified
125    pub updated_at: DateTime<Utc>,
126
127    /// When template was last used
128    pub last_used_at: Option<DateTime<Utc>>,
129
130    /// Number of times this template has been used
131    pub usage_count: u64,
132
133    /// Template version (for migration purposes)
134    pub version: u32,
135
136    /// Optional user-defined tags
137    pub tags: Vec<String>,
138}
139
140impl Default for TemplateMetadata {
141    fn default() -> Self {
142        let now = Utc::now();
143        Self {
144            created_at: now,
145            updated_at: now,
146            last_used_at: None,
147            usage_count: 0,
148            version: 1,
149            tags: vec![],
150        }
151    }
152}
153
154impl ExtractionTemplate {
155    /// Create a new template with defaults
156    pub fn new(name: impl Into<String>) -> Self {
157        Self {
158            id: uuid::Uuid::new_v4(),
159            name: name.into(),
160            description: None,
161            regions: vec![],
162            metadata: TemplateMetadata::default(),
163        }
164    }
165
166    /// Add a region to this template
167    #[must_use]
168    pub fn with_region(mut self, region: Region) -> Self {
169        self.regions.push(region);
170        self
171    }
172
173    /// Set template description
174    #[must_use]
175    pub fn with_description(mut self, desc: impl Into<String>) -> Self {
176        self.description = Some(desc.into());
177        self
178    }
179
180    /// Set template tags
181    #[must_use]
182    pub fn with_tags(mut self, tags: Vec<String>) -> Self {
183        self.metadata.tags = tags;
184        self
185    }
186
187    /// Validate the entire template
188    pub fn validate(&self) -> crate::Result<()> {
189        if self.name.is_empty() {
190            return Err(crate::error::PluginError::TemplateValidationError(
191                "template name cannot be empty".to_string(),
192            ));
193        }
194        for region in &self.regions {
195            region.validate()?;
196        }
197        Ok(())
198    }
199
200    /// Update usage statistics
201    pub fn mark_used(&mut self) {
202        self.metadata.usage_count += 1;
203        self.metadata.last_used_at = Some(Utc::now());
204        self.metadata.updated_at = Utc::now();
205    }
206}
207
208/// Request to extract data from a page using a template
209#[derive(Debug, Clone, Serialize, Deserialize)]
210pub struct ExtractionRequest {
211    /// Template to use for extraction
212    pub template: ExtractionTemplate,
213
214    /// Target URL (for context/logging)
215    pub url: String,
216
217    /// HTML content of the page to extract from
218    pub html: String,
219
220    /// Idempotency key for safe retries
221    pub idempotency_key: IdempotencyKey,
222
223    /// Timeout in milliseconds
224    pub timeout_ms: u64,
225
226    /// Optional extraction context (arbitrary JSON)
227    pub context: Option<Value>,
228}
229
230impl ExtractionRequest {
231    /// Create a new extraction request
232    pub fn new(
233        template: ExtractionTemplate,
234        url: impl Into<String>,
235        html: impl Into<String>,
236    ) -> Self {
237        Self {
238            template,
239            url: url.into(),
240            html: html.into(),
241            idempotency_key: IdempotencyKey::new(),
242            timeout_ms: 30_000,
243            context: None,
244        }
245    }
246
247    /// Set idempotency key
248    #[must_use]
249    pub const fn with_idempotency_key(mut self, key: IdempotencyKey) -> Self {
250        self.idempotency_key = key;
251        self
252    }
253
254    /// Set timeout
255    #[must_use]
256    pub const fn with_timeout(mut self, ms: u64) -> Self {
257        self.timeout_ms = ms;
258        self
259    }
260
261    /// Set context
262    #[must_use]
263    pub fn with_context(mut self, context: Value) -> Self {
264        self.context = Some(context);
265        self
266    }
267
268    /// Validate the request
269    pub fn validate(&self) -> crate::Result<()> {
270        self.template.validate()?;
271        if self.url.is_empty() {
272            return Err(crate::error::PluginError::ExtractionError(
273                "URL cannot be empty".to_string(),
274            ));
275        }
276        if self.html.is_empty() {
277            return Err(crate::error::PluginError::ExtractionError(
278                "HTML cannot be empty".to_string(),
279            ));
280        }
281        Ok(())
282    }
283}
284
285/// Result of a successful extraction
286#[derive(Debug, Clone, Serialize, Deserialize)]
287pub struct ExtractionResult {
288    /// Extracted data keyed by region name
289    pub data: HashMap<String, Value>,
290
291    /// Metadata about the extraction
292    pub metadata: ExtractionMetadata,
293}
294
295/// Metadata about an extraction result
296#[derive(Debug, Clone, Serialize, Deserialize)]
297pub struct ExtractionMetadata {
298    /// Idempotency key used
299    pub idempotency_key: IdempotencyKey,
300
301    /// When extraction was completed
302    pub completed_at: DateTime<Utc>,
303
304    /// Elapsed time in milliseconds
305    pub elapsed_ms: u64,
306
307    /// Success rate for selectors (0-100)
308    pub selector_success_rate: f32,
309
310    /// Per-region extraction status
311    pub region_status: HashMap<String, RegionStatus>,
312
313    /// Optional error details
314    pub errors: Vec<String>,
315}
316
317/// Status of extraction for a single region
318#[derive(Debug, Clone, Serialize, Deserialize)]
319pub struct RegionStatus {
320    /// Whether extraction succeeded
321    pub success: bool,
322
323    /// Number of elements matched
324    pub matched_count: usize,
325
326    /// Error message if failed
327    pub error: Option<String>,
328}
329
330impl ExtractionResult {
331    /// Create a new extraction result
332    pub fn new(idempotency_key: IdempotencyKey) -> Self {
333        Self {
334            data: HashMap::new(),
335            metadata: ExtractionMetadata {
336                idempotency_key,
337                completed_at: Utc::now(),
338                elapsed_ms: 0,
339                selector_success_rate: 0.0,
340                region_status: HashMap::new(),
341                errors: vec![],
342            },
343        }
344    }
345
346    /// Add extracted data for a region
347    #[must_use]
348    pub fn with_region_data(mut self, region_name: impl Into<String>, data: Value) -> Self {
349        self.data.insert(region_name.into(), data);
350        self
351    }
352
353    /// Add an error
354    #[must_use]
355    pub fn with_error(mut self, error: impl Into<String>) -> Self {
356        self.metadata.errors.push(error.into());
357        self
358    }
359
360    /// Update elapsed time
361    #[must_use]
362    pub const fn set_elapsed_ms(mut self, ms: u64) -> Self {
363        self.metadata.elapsed_ms = ms;
364        self
365    }
366
367    /// Calculate and set selector success rate
368    #[expect(
369        clippy::cast_precision_loss,
370        reason = "region counts are small enough to be safe as f32"
371    )]
372    pub fn calculate_success_rate(&mut self) {
373        if self.metadata.region_status.is_empty() {
374            self.metadata.selector_success_rate = 100.0;
375            return;
376        }
377        let successful = self
378            .metadata
379            .region_status
380            .values()
381            .filter(|status| status.success)
382            .count();
383        self.metadata.selector_success_rate =
384            (successful as f32 / self.metadata.region_status.len() as f32) * 100.0;
385    }
386
387    /// Check if extraction was fully successful
388    pub fn is_fully_successful(&self) -> bool {
389        self.metadata.selector_success_rate >= 100.0 && self.metadata.errors.is_empty()
390    }
391}