Skip to main content

stygian_plugin/domain/
extraction.rs

1//! Extraction template, request, and result types
2
3use crate::domain::idempotency::IdempotencyKey;
4use crate::domain::selector::Selector;
5use crate::domain::transformation::Transformation;
6use crate::reliability::ReliabilityScore;
7use chrono::{DateTime, Utc};
8use serde::{Deserialize, Serialize};
9use serde_json::Value;
10use std::collections::HashMap;
11
12/// A named region within a template to extract data from
13///
14/// Each region represents a distinct zone on the page with its own
15/// selectors and transformations.
16///
17/// # Example
18///
19/// ```
20/// use stygian_plugin::domain::Region;
21/// use stygian_plugin::domain::Selector;
22///
23/// let region = Region {
24///     name: "product-title".to_string(),
25///     selector: Selector::css(".product-name".to_string()),
26///     schema: serde_json::json!({"type": "string"}),
27///     transformations: vec![],
28/// };
29/// ```
30#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct Region {
32    /// Region name (e.g., "product-title", "price", "rating")
33    pub name: String,
34
35    /// Primary selector (`CSS` or `XPath`) to locate the element
36    pub selector: Selector,
37
38    /// JSON schema describing the expected output shape
39    pub schema: Value,
40
41    /// Ordered transformations to apply to extracted values
42    pub transformations: Vec<Transformation>,
43}
44
45impl Region {
46    /// Create a new region with minimal configuration
47    pub fn new(name: impl Into<String>, selector: Selector, schema: Value) -> Self {
48        Self {
49            name: name.into(),
50            selector,
51            schema,
52            transformations: vec![],
53        }
54    }
55
56    /// Add a transformation to the pipeline
57    #[must_use]
58    pub fn with_transformation(mut self, transformation: Transformation) -> Self {
59        self.transformations.push(transformation);
60        self
61    }
62
63    /// Validate region configuration
64    ///
65    /// # Errors
66    ///
67    /// Returns [`crate::error::PluginError::TemplateValidationError`] when
68    /// the region name is empty or the JSON schema is not an object. Returns
69    /// [`crate::error::PluginError::SelectorError`] when the region's
70    /// selector fails its own `validate()` call.
71    pub fn validate(&self) -> crate::Result<()> {
72        if self.name.is_empty() {
73            return Err(crate::error::PluginError::TemplateValidationError(
74                "region name cannot be empty".to_string(),
75            ));
76        }
77        if !self.schema.is_object() {
78            return Err(crate::error::PluginError::TemplateValidationError(format!(
79                "region schema must be a JSON object, got {}",
80                self.schema.get("type").unwrap_or(&Value::Null)
81            )));
82        }
83        // Validate the selector syntax
84        self.selector.validate()?;
85        Ok(())
86    }
87}
88
89/// A reusable extraction template defining how to extract data from a page
90///
91/// Templates combine multiple regions, each with selectors and transformations.
92/// A template is the core unit of plugin configuration and is persisted for reuse.
93///
94/// # Example
95///
96/// ```
97/// use stygian_plugin::domain::{ExtractionTemplate, Region, Selector};
98/// use serde_json::json;
99///
100/// let template = ExtractionTemplate {
101///     id: uuid::Uuid::new_v4(),
102///     name: "Product Listing".to_string(),
103///     description: Some("Extract product cards from a listing page".to_string()),
104///     regions: vec![],
105///     metadata: Default::default(),
106/// };
107/// ```
108#[derive(Debug, Clone, Serialize, Deserialize)]
109pub struct ExtractionTemplate {
110    /// Unique identifier for this template
111    pub id: uuid::Uuid,
112
113    /// User-friendly template name
114    pub name: String,
115
116    /// Optional description
117    pub description: Option<String>,
118
119    /// Regions (named extraction zones) in this template
120    pub regions: Vec<Region>,
121
122    /// Metadata (timestamps, version, etc.)
123    pub metadata: TemplateMetadata,
124}
125
126/// Metadata about a template
127#[derive(Debug, Clone, Serialize, Deserialize)]
128pub struct TemplateMetadata {
129    /// When template was created
130    pub created_at: DateTime<Utc>,
131
132    /// When template was last modified
133    pub updated_at: DateTime<Utc>,
134
135    /// When template was last used
136    pub last_used_at: Option<DateTime<Utc>>,
137
138    /// Number of times this template has been used
139    pub usage_count: u64,
140
141    /// Template version (for migration purposes)
142    pub version: u32,
143
144    /// Optional user-defined tags
145    pub tags: Vec<String>,
146}
147
148impl Default for TemplateMetadata {
149    fn default() -> Self {
150        let now = Utc::now();
151        Self {
152            created_at: now,
153            updated_at: now,
154            last_used_at: None,
155            usage_count: 0,
156            version: 1,
157            tags: vec![],
158        }
159    }
160}
161
162impl ExtractionTemplate {
163    /// Create a new template with defaults
164    pub fn new(name: impl Into<String>) -> Self {
165        Self {
166            id: uuid::Uuid::new_v4(),
167            name: name.into(),
168            description: None,
169            regions: vec![],
170            metadata: TemplateMetadata::default(),
171        }
172    }
173
174    /// Add a region to this template
175    #[must_use]
176    pub fn with_region(mut self, region: Region) -> Self {
177        self.regions.push(region);
178        self
179    }
180
181    /// Set template description
182    #[must_use]
183    pub fn with_description(mut self, desc: impl Into<String>) -> Self {
184        self.description = Some(desc.into());
185        self
186    }
187
188    /// Set template tags
189    #[must_use]
190    pub fn with_tags(mut self, tags: Vec<String>) -> Self {
191        self.metadata.tags = tags;
192        self
193    }
194
195    /// Validate the entire template
196    ///
197    /// # Errors
198    ///
199    /// Returns [`crate::error::PluginError::TemplateValidationError`] when
200    /// the template name is empty. Propagates any error returned by the
201    /// per-region `validate()` call (empty name, non-object schema, or
202    /// invalid selector).
203    pub fn validate(&self) -> crate::Result<()> {
204        if self.name.is_empty() {
205            return Err(crate::error::PluginError::TemplateValidationError(
206                "template name cannot be empty".to_string(),
207            ));
208        }
209        for region in &self.regions {
210            region.validate()?;
211        }
212        Ok(())
213    }
214
215    /// Update usage statistics
216    pub fn mark_used(&mut self) {
217        self.metadata.usage_count += 1;
218        self.metadata.last_used_at = Some(Utc::now());
219        self.metadata.updated_at = Utc::now();
220    }
221}
222
223/// Request to extract data from a page using a template
224#[derive(Debug, Clone, Serialize, Deserialize)]
225pub struct ExtractionRequest {
226    /// Template to use for extraction
227    pub template: ExtractionTemplate,
228
229    /// Target URL (for context/logging)
230    pub url: String,
231
232    /// HTML content of the page to extract from
233    pub html: String,
234
235    /// Idempotency key for safe retries
236    pub idempotency_key: IdempotencyKey,
237
238    /// Timeout in milliseconds
239    pub timeout_ms: u64,
240
241    /// Optional extraction context (arbitrary JSON)
242    pub context: Option<Value>,
243}
244
245impl ExtractionRequest {
246    /// Create a new extraction request
247    pub fn new(
248        template: ExtractionTemplate,
249        url: impl Into<String>,
250        html: impl Into<String>,
251    ) -> Self {
252        Self {
253            template,
254            url: url.into(),
255            html: html.into(),
256            idempotency_key: IdempotencyKey::new(),
257            timeout_ms: 30_000,
258            context: None,
259        }
260    }
261
262    /// Set idempotency key
263    #[must_use]
264    pub const fn with_idempotency_key(mut self, key: IdempotencyKey) -> Self {
265        self.idempotency_key = key;
266        self
267    }
268
269    /// Set timeout
270    #[must_use]
271    pub const fn with_timeout(mut self, ms: u64) -> Self {
272        self.timeout_ms = ms;
273        self
274    }
275
276    /// Set context
277    #[must_use]
278    pub fn with_context(mut self, context: Value) -> Self {
279        self.context = Some(context);
280        self
281    }
282
283    /// Validate the request
284    ///
285    /// # Errors
286    ///
287    /// Returns [`crate::error::PluginError::TemplateValidationError`] when
288    /// the embedded template is invalid. Returns
289    /// [`crate::error::PluginError::ExtractionError`] when the request URL
290    /// or HTML payload is empty.
291    pub fn validate(&self) -> crate::Result<()> {
292        self.template.validate()?;
293        if self.url.is_empty() {
294            return Err(crate::error::PluginError::ExtractionError(
295                "URL cannot be empty".to_string(),
296            ));
297        }
298        if self.html.is_empty() {
299            return Err(crate::error::PluginError::ExtractionError(
300                "HTML cannot be empty".to_string(),
301            ));
302        }
303        Ok(())
304    }
305}
306
307/// Result of a successful extraction
308#[derive(Debug, Clone, Serialize, Deserialize)]
309pub struct ExtractionResult {
310    /// Extracted data keyed by region name
311    pub data: HashMap<String, Value>,
312
313    /// Metadata about the extraction
314    pub metadata: ExtractionMetadata,
315}
316
317/// Metadata about an extraction result
318#[derive(Debug, Clone, Serialize, Deserialize)]
319pub struct ExtractionMetadata {
320    /// Idempotency key used
321    pub idempotency_key: IdempotencyKey,
322
323    /// When extraction was completed
324    pub completed_at: DateTime<Utc>,
325
326    /// Elapsed time in milliseconds
327    pub elapsed_ms: u64,
328
329    /// Success rate for selectors (0-100)
330    pub selector_success_rate: f32,
331
332    /// Per-region extraction status
333    pub region_status: HashMap<String, RegionStatus>,
334
335    /// Optional error details
336    pub errors: Vec<String>,
337
338    /// Optional reliability score for the extraction output (T87).
339    ///
340    /// This field is **additive** — older consumers that don't know about
341    /// reliability scoring see `None` (or the field omitted entirely when
342    /// serialized with `skip_serializing_if = "Option::is_none"`).
343    /// Default-on per the T87 spec; no feature gate is required.
344    #[serde(default, skip_serializing_if = "Option::is_none")]
345    pub reliability: Option<ReliabilityScore>,
346}
347
348/// Status of extraction for a single region
349#[derive(Debug, Clone, Serialize, Deserialize)]
350pub struct RegionStatus {
351    /// Whether extraction succeeded
352    pub success: bool,
353
354    /// Number of elements matched
355    pub matched_count: usize,
356
357    /// Error message if failed
358    pub error: Option<String>,
359}
360
361impl ExtractionResult {
362    /// Create a new extraction result
363    #[must_use]
364    pub fn new(idempotency_key: IdempotencyKey) -> Self {
365        Self {
366            data: HashMap::new(),
367            metadata: ExtractionMetadata {
368                idempotency_key,
369                completed_at: Utc::now(),
370                elapsed_ms: 0,
371                selector_success_rate: 0.0,
372                region_status: HashMap::new(),
373                errors: vec![],
374                reliability: None,
375            },
376        }
377    }
378
379    /// Add extracted data for a region
380    #[must_use]
381    pub fn with_region_data(mut self, region_name: impl Into<String>, data: Value) -> Self {
382        self.data.insert(region_name.into(), data);
383        self
384    }
385
386    /// Add an error
387    #[must_use]
388    pub fn with_error(mut self, error: impl Into<String>) -> Self {
389        self.metadata.errors.push(error.into());
390        self
391    }
392
393    /// Update elapsed time
394    #[must_use]
395    pub const fn set_elapsed_ms(mut self, ms: u64) -> Self {
396        self.metadata.elapsed_ms = ms;
397        self
398    }
399
400    /// Calculate and set selector success rate
401    #[expect(
402        clippy::cast_precision_loss,
403        reason = "region counts are small enough to be safe as f32"
404    )]
405    pub fn calculate_success_rate(&mut self) {
406        if self.metadata.region_status.is_empty() {
407            self.metadata.selector_success_rate = 100.0;
408            return;
409        }
410        let successful = self
411            .metadata
412            .region_status
413            .values()
414            .filter(|status| status.success)
415            .count();
416        self.metadata.selector_success_rate =
417            (successful as f32 / self.metadata.region_status.len() as f32) * 100.0;
418    }
419
420    /// Check if extraction was fully successful
421    #[must_use]
422    pub fn is_fully_successful(&self) -> bool {
423        self.metadata.selector_success_rate >= 100.0 && self.metadata.errors.is_empty()
424    }
425}