Skip to main content

stygian_plugin/adapters/
extraction_engine.rs

1//! Extraction engine: core adapter that executes extractions against HTML
2//!
3//! Uses scraper (CSS selectors) to extract and transform data according to templates.
4
5use crate::domain::{ExtractionRequest, ExtractionResult, RegionStatus};
6use crate::error::PluginError;
7use crate::{Result, ports::PluginExtractionPort};
8use async_trait::async_trait;
9use scraper::{Html, Selector as ScraperSelector};
10use std::time::Instant;
11
/// Extraction engine: executes templates against HTML.
///
/// Uses the `scraper` crate to evaluate CSS selectors against HTML,
/// applies transformations, and builds structured results.
///
/// The engine is a field-less unit struct, so values of it carry no state.
/// It derives the common traits so it can be logged, copied freely, and
/// constructed through `Default` as well as by its literal name.
#[derive(Debug, Clone, Copy, Default)]
pub struct ExtractionEngine;
17
18impl ExtractionEngine {
19    /// Execute an extraction request
20    pub fn execute(request: &ExtractionRequest) -> Result<ExtractionResult> {
21        request.validate()?;
22        request.template.validate()?;
23
24        let start = Instant::now();
25        let document = Html::parse_document(&request.html);
26
27        let mut result = ExtractionResult::new(request.idempotency_key);
28        let mut successful_regions = 0;
29
30        for region in &request.template.regions {
31            region.validate()?;
32
33            match execute_region(&document, region) {
34                Ok(extracted_values) => {
35                    let count = extracted_values.len();
36
37                    // For single values, return as-is; for multiple, return array
38                    let result_value = if count == 1 {
39                        serde_json::Value::String(extracted_values.into_iter().next().ok_or_else(
40                            || {
41                                PluginError::ExtractionError(
42                                    "selector matched a single value, but none were extracted"
43                                        .to_string(),
44                                )
45                            },
46                        )?)
47                    } else {
48                        serde_json::Value::Array(
49                            extracted_values
50                                .into_iter()
51                                .map(serde_json::Value::String)
52                                .collect(),
53                        )
54                    };
55
56                    result
57                        .data
58                        .insert(region.name.clone(), result_value.clone());
59                    result.metadata.region_status.insert(
60                        region.name.clone(),
61                        RegionStatus {
62                            success: true,
63                            matched_count: count,
64                            error: None,
65                        },
66                    );
67                    successful_regions += 1;
68                }
69                Err(e) => {
70                    result.metadata.region_status.insert(
71                        region.name.clone(),
72                        RegionStatus {
73                            success: false,
74                            matched_count: 0,
75                            error: Some(e.to_string()),
76                        },
77                    );
78                    result = result.with_error(format!("Region '{}': {}", region.name, e));
79                }
80            }
81        }
82
83        // Calculate success rate
84        if request.template.regions.is_empty() {
85            result.metadata.selector_success_rate = 100.0;
86        } else {
87            let successful = u16::try_from(successful_regions).unwrap_or(u16::MAX);
88            let total = u16::try_from(request.template.regions.len()).unwrap_or(u16::MAX);
89            result.metadata.selector_success_rate =
90                (f32::from(successful) / f32::from(total)) * 100.0;
91        }
92
93        let elapsed = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
94        result = result.set_elapsed_ms(elapsed);
95
96        Ok(result)
97    }
98}
99
100/// Execute extraction for a single region
101fn execute_region(document: &Html, region: &crate::domain::Region) -> Result<Vec<String>> {
102    // Check selector type and route accordingly
103    let selector_text = match &region.selector {
104        crate::domain::Selector::XPath(_) => {
105            return Err(crate::error::PluginError::ExtractionError(
106                "XPath selectors are not yet supported. Please use CSS selectors instead."
107                    .to_string(),
108            ));
109        }
110        crate::domain::Selector::Css(css) | crate::domain::Selector::Both { css, .. } => css,
111    };
112
113    // Parse as CSS selector
114    let selector = ScraperSelector::parse(selector_text).map_err(|e| {
115        crate::error::PluginError::SelectorError {
116            selector: selector_text.clone(),
117            reason: format!("Failed to parse CSS selector: {e:?}"),
118        }
119    })?;
120
121    let mut results = Vec::new();
122
123    // Select all matching elements
124    for element in document.select(&selector) {
125        let text = element.inner_html();
126
127        // Apply transformation chain to the extracted text
128        let transformed =
129            crate::domain::Transformation::apply_chain(&region.transformations, text)?;
130
131        results.push(transformed);
132    }
133
134    if results.is_empty() {
135        return Err(crate::error::PluginError::ExtractionError(format!(
136            "No elements matched CSS selector: {selector_text}"
137        )));
138    }
139
140    Ok(results)
141}
142
143#[async_trait]
144impl PluginExtractionPort for ExtractionEngine {
145    async fn execute(&self, request: &ExtractionRequest) -> Result<ExtractionResult> {
146        Self::execute(request)
147    }
148
149    async fn validate_selector(&self, html: &str, selector_expr: &str) -> Result<(bool, usize)> {
150        let document = Html::parse_document(html);
151
152        ScraperSelector::parse(selector_expr).map_or(Ok((false, 0)), |selector| {
153            let count = document.select(&selector).count();
154            Ok((true, count))
155        })
156    }
157}
158
#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::{ExtractionTemplate, Region, Selector, Transformation};
    use serde_json::{Value, json};

    /// Wrap `region` in a single-region template named "Test" and run the
    /// engine against `html`.
    fn extract(region: Region, html: &str) -> crate::Result<ExtractionResult> {
        let template = ExtractionTemplate::new("Test").with_region(region);
        let request = ExtractionRequest::new(template, "http://example.com", html);
        ExtractionEngine::execute(&request)
    }

    /// A selector with exactly one match yields a plain JSON string.
    #[test]
    fn test_extract_single_element() -> crate::Result<()> {
        let region = Region::new("title", Selector::css(".title"), json!({"type": "string"}));
        let outcome = extract(region, r#"<div><p class="title">Hello World</p></div>"#)?;

        assert!(outcome.is_fully_successful());
        assert_eq!(outcome.data.get("title"), Some(&json!("Hello World")));
        Ok(())
    }

    /// A selector with several matches yields a JSON array, one entry per
    /// match.
    #[test]
    fn test_extract_multiple_elements() -> crate::Result<()> {
        let html = r#"
            <div>
                <p class="item">Item 1</p>
                <p class="item">Item 2</p>
                <p class="item">Item 3</p>
            </div>
        "#;
        let region = Region::new("items", Selector::css(".item"), json!({"type": "array"}));
        let outcome = extract(region, html)?;

        let matched = outcome.data.get("items").and_then(Value::as_array);
        assert_eq!(matched.map(Vec::len), Some(3));
        Ok(())
    }

    /// Transformations attached to a region are applied to the extracted
    /// value.
    #[test]
    fn test_extract_with_transformation() -> crate::Result<()> {
        let region = Region::new("price", Selector::css(".price"), json!({"type": "string"}))
            .with_transformation(Transformation::Trim);
        let outcome = extract(region, r#"<div><p class="price">  $19.99  </p></div>"#)?;

        assert_eq!(outcome.data.get("price"), Some(&json!("$19.99")));
        Ok(())
    }

    /// Valid selectors are reported as valid together with their match
    /// count, including a count of zero.
    #[tokio::test]
    async fn test_selector_validation() -> crate::Result<()> {
        let html = r#"<div><p class="test">Content</p></div>"#;
        let engine = ExtractionEngine;

        let (is_valid, matches) = engine.validate_selector(html, ".test").await?;
        assert!(is_valid);
        assert_eq!(matches, 1);

        let (is_valid, matches) = engine.validate_selector(html, ".nonexistent").await?;
        assert!(is_valid);
        assert_eq!(matches, 0);
        Ok(())
    }

    /// An unparseable selector is reported as invalid rather than as an
    /// error.
    #[tokio::test]
    async fn test_invalid_css_selector() -> crate::Result<()> {
        let engine = ExtractionEngine;

        let (is_valid, _) = engine
            .validate_selector("<div><p>Content</p></div>", ">>>invalid")
            .await?;
        assert!(!is_valid);
        Ok(())
    }
}