Skip to main content

stygian_plugin/adapters/
extraction_engine.rs

1//! Extraction engine: core adapter that executes extractions against HTML
2//!
3//! Uses scraper (CSS selectors) to extract and transform data according to templates.
4
5use crate::domain::{ExtractionRequest, ExtractionResult, RegionStatus};
6use crate::error::PluginError;
7use crate::{Result, ports::PluginExtractionPort};
8use async_trait::async_trait;
9use scraper::{Html, Selector as ScraperSelector};
10use serde::Serialize;
11use std::collections::HashMap;
12use std::time::Instant;
13
/// Extraction engine: executes templates against HTML.
///
/// Uses the `scraper` crate to evaluate CSS selectors against HTML,
/// applies transformations, and builds structured results.
///
/// This is a field-less unit struct: the inherent `execute` and `diagnose`
/// functions take no `self`, and the value itself is only needed to call the
/// engine through the `PluginExtractionPort` trait.
pub struct ExtractionEngine;
19
/// One step of a region's transformation chain, as recorded by
/// `ExtractionEngine::diagnose`.
#[derive(Debug, Clone, Serialize)]
pub struct TransformationDebugStep {
    /// `Debug` rendering of the transformation that was applied.
    pub transformation: String,
    /// Value fed into the transformation, shortened for display.
    pub input: String,
    /// Value produced by the transformation, shortened for display;
    /// `None` when the transformation failed.
    pub output: Option<String>,
    /// Error message of a failed transformation; `None` on success.
    pub error: Option<String>,
}
27
/// Diagnostic report for a single template region.
#[derive(Debug, Clone, Serialize)]
pub struct RegionDebugInfo {
    /// Selector text that was inspected (the CSS half for dual selectors).
    pub selector: String,
    /// Selector flavour: `"css"`, `"xpath"` or `"dual"`.
    pub selector_kind: String,
    /// Caller-supplied label describing what the selector was evaluated against.
    pub evaluation_scope: String,
    /// Number of elements the selector matched (0 when it could not be evaluated).
    pub match_count: usize,
    /// Outer HTML of the first match, shortened for display.
    pub raw_match_html: Option<String>,
    /// Inner HTML of the first match before any transformation, shortened for display.
    pub raw_extracted_value: Option<String>,
    /// Per-transformation trace, in application order, for the first match.
    pub transformation_output_chain: Vec<TransformationDebugStep>,
    /// Value after the whole chain succeeded, shortened for display.
    pub final_value: Option<String>,
    /// Why nothing could be extracted, or which transformation failed.
    pub error: Option<String>,
}
40
/// Diagnostic report for a whole extraction request.
#[derive(Debug, Clone, Serialize)]
pub struct ExtractionDebugInfo {
    /// Caller-supplied label describing what the selectors were evaluated against.
    pub evaluation_scope: String,
    /// Leading part of the request HTML, shortened for display.
    pub root_html_snippet: String,
    /// Per-region reports, keyed by region name.
    pub regions: HashMap<String, RegionDebugInfo>,
}
47
48impl ExtractionEngine {
49    /// Execute an extraction request
50    ///
51    /// # Errors
52    ///
53    /// Returns [`crate::error::PluginError::TemplateValidationError`] when
54    /// the embedded template fails validation. Returns
55    /// [`crate::error::PluginError::ExtractionError`] when the request URL
56    /// or HTML payload is empty, or a region selector matches no elements.
57    /// Per-region extraction errors are captured as `RegionStatus::error`
58    /// entries on the returned [`ExtractionResult`] rather than returned
59    /// directly — only the first hard failure short-circuits the result.
60    pub fn execute(request: &ExtractionRequest) -> Result<ExtractionResult> {
61        request.validate()?;
62        request.template.validate()?;
63
64        let start = Instant::now();
65        let document = Html::parse_document(&request.html);
66
67        let mut result = ExtractionResult::new(request.idempotency_key);
68        let mut successful_regions = 0;
69
70        for region in &request.template.regions {
71            region.validate()?;
72
73            match execute_region(&document, region) {
74                Ok(extracted_values) => {
75                    let count = extracted_values.len();
76
77                    // For single values, return as-is; for multiple, return array
78                    let result_value = if count == 1 {
79                        serde_json::Value::String(extracted_values.into_iter().next().ok_or_else(
80                            || {
81                                PluginError::ExtractionError(
82                                    "selector matched a single value, but none were extracted"
83                                        .to_string(),
84                                )
85                            },
86                        )?)
87                    } else {
88                        serde_json::Value::Array(
89                            extracted_values
90                                .into_iter()
91                                .map(serde_json::Value::String)
92                                .collect(),
93                        )
94                    };
95
96                    result
97                        .data
98                        .insert(region.name.clone(), result_value.clone());
99                    result.metadata.region_status.insert(
100                        region.name.clone(),
101                        RegionStatus {
102                            success: true,
103                            matched_count: count,
104                            error: None,
105                        },
106                    );
107                    successful_regions += 1;
108                }
109                Err(e) => {
110                    result.metadata.region_status.insert(
111                        region.name.clone(),
112                        RegionStatus {
113                            success: false,
114                            matched_count: 0,
115                            error: Some(e.to_string()),
116                        },
117                    );
118                    result = result.with_error(format!("Region '{}': {}", region.name, e));
119                }
120            }
121        }
122
123        // Calculate success rate
124        if request.template.regions.is_empty() {
125            result.metadata.selector_success_rate = 100.0;
126        } else {
127            let successful = u16::try_from(successful_regions).unwrap_or(u16::MAX);
128            let total = u16::try_from(request.template.regions.len()).unwrap_or(u16::MAX);
129            result.metadata.selector_success_rate =
130                (f32::from(successful) / f32::from(total)) * 100.0;
131        }
132
133        let elapsed = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
134        result = result.set_elapsed_ms(elapsed);
135
136        Ok(result)
137    }
138
139    #[must_use]
140    pub fn diagnose(request: &ExtractionRequest, evaluation_scope: &str) -> ExtractionDebugInfo {
141        let document = Html::parse_document(&request.html);
142        let mut regions = HashMap::new();
143
144        for region in &request.template.regions {
145            regions.insert(
146                region.name.clone(),
147                diagnose_region(&document, region, evaluation_scope),
148            );
149        }
150
151        ExtractionDebugInfo {
152            evaluation_scope: evaluation_scope.to_string(),
153            root_html_snippet: truncate_debug(&request.html, 2_000),
154            regions,
155        }
156    }
157}
158
159/// Execute extraction for a single region
160fn execute_region(document: &Html, region: &crate::domain::Region) -> Result<Vec<String>> {
161    // Check selector type and route accordingly
162    let selector_text = match &region.selector {
163        crate::domain::Selector::XPath(_) => {
164            return Err(crate::error::PluginError::ExtractionError(
165                "XPath selectors are not yet supported. Please use CSS selectors instead."
166                    .to_string(),
167            ));
168        }
169        crate::domain::Selector::Css(css) | crate::domain::Selector::Both { css, .. } => css,
170    };
171
172    // Parse as CSS selector
173    let selector = ScraperSelector::parse(selector_text).map_err(|e| {
174        crate::error::PluginError::SelectorError {
175            selector: selector_text.clone(),
176            reason: format!("Failed to parse CSS selector: {e:?}"),
177        }
178    })?;
179
180    let mut results = Vec::new();
181
182    // Select all matching elements
183    for element in document.select(&selector) {
184        let text = element.inner_html();
185
186        // Apply transformation chain to the extracted text
187        let transformed =
188            crate::domain::Transformation::apply_chain(&region.transformations, text)?;
189
190        results.push(transformed);
191    }
192
193    if results.is_empty() {
194        return Err(crate::error::PluginError::ExtractionError(format!(
195            "No elements matched CSS selector: {selector_text}"
196        )));
197    }
198
199    Ok(results)
200}
201
202fn diagnose_region(
203    document: &Html,
204    region: &crate::domain::Region,
205    evaluation_scope: &str,
206) -> RegionDebugInfo {
207    let (selector_kind, selector_text) = match &region.selector {
208        crate::domain::Selector::Css(css) => ("css", css.as_str()),
209        crate::domain::Selector::XPath(xpath) => ("xpath", xpath.as_str()),
210        crate::domain::Selector::Both { css, .. } => ("dual", css.as_str()),
211    };
212
213    if matches!(&region.selector, crate::domain::Selector::XPath(_)) {
214        return RegionDebugInfo {
215            selector: selector_text.to_string(),
216            selector_kind: selector_kind.to_string(),
217            evaluation_scope: evaluation_scope.to_string(),
218            match_count: 0,
219            raw_match_html: None,
220            raw_extracted_value: None,
221            transformation_output_chain: Vec::new(),
222            final_value: None,
223            error: Some(
224                "XPath selectors are not yet supported. Please use CSS selectors instead."
225                    .to_string(),
226            ),
227        };
228    }
229
230    let selector = match ScraperSelector::parse(selector_text) {
231        Ok(selector) => selector,
232        Err(error) => {
233            return RegionDebugInfo {
234                selector: selector_text.to_string(),
235                selector_kind: selector_kind.to_string(),
236                evaluation_scope: evaluation_scope.to_string(),
237                match_count: 0,
238                raw_match_html: None,
239                raw_extracted_value: None,
240                transformation_output_chain: Vec::new(),
241                final_value: None,
242                error: Some(format!("Failed to parse CSS selector: {error:?}")),
243            };
244        }
245    };
246
247    let elements: Vec<_> = document.select(&selector).collect();
248    let match_count = elements.len();
249
250    let Some(first_match) = elements.first() else {
251        return RegionDebugInfo {
252            selector: selector_text.to_string(),
253            selector_kind: selector_kind.to_string(),
254            evaluation_scope: evaluation_scope.to_string(),
255            match_count,
256            raw_match_html: None,
257            raw_extracted_value: None,
258            transformation_output_chain: Vec::new(),
259            final_value: None,
260            error: Some(format!("No elements matched CSS selector: {selector_text}")),
261        };
262    };
263
264    let raw_match_html = truncate_debug(&first_match.html(), 800);
265    let raw_extracted_value = first_match.inner_html();
266    let (transformation_output_chain, final_value, error) =
267        trace_transformations(&region.transformations, &raw_extracted_value);
268
269    RegionDebugInfo {
270        selector: selector_text.to_string(),
271        selector_kind: selector_kind.to_string(),
272        evaluation_scope: evaluation_scope.to_string(),
273        match_count,
274        raw_match_html: Some(raw_match_html),
275        raw_extracted_value: Some(truncate_debug(&raw_extracted_value, 800)),
276        transformation_output_chain,
277        final_value: final_value.map(|value| truncate_debug(&value, 800)),
278        error,
279    }
280}
281
282fn trace_transformations(
283    transformations: &[crate::domain::Transformation],
284    raw_value: &str,
285) -> (Vec<TransformationDebugStep>, Option<String>, Option<String>) {
286    let mut current = raw_value.to_string();
287    let mut steps = Vec::with_capacity(transformations.len());
288
289    for transformation in transformations {
290        let input = current.clone();
291        match transformation.apply(&current) {
292            Ok(output) => {
293                steps.push(TransformationDebugStep {
294                    transformation: format!("{transformation:?}"),
295                    input: truncate_debug(&input, 400),
296                    output: Some(truncate_debug(&output, 400)),
297                    error: None,
298                });
299                current = output;
300            }
301            Err(error) => {
302                let error_text = error.to_string();
303                steps.push(TransformationDebugStep {
304                    transformation: format!("{transformation:?}"),
305                    input: truncate_debug(&input, 400),
306                    output: None,
307                    error: Some(error_text.clone()),
308                });
309                return (steps, None, Some(error_text));
310            }
311        }
312    }
313
314    (steps, Some(current), None)
315}
316
/// Shorten `value` to at most `max_chars` characters for display.
///
/// Counts Unicode scalar values, not bytes, so the cut never lands inside a
/// multi-byte character. When something was cut off, `"..."` is appended;
/// a value of `max_chars` characters or fewer is returned unchanged.
fn truncate_debug(value: &str, max_chars: usize) -> String {
    // The byte offset of the character at index `max_chars` is where the
    // kept prefix ends; no such character means the value already fits.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => {
            let mut clipped = String::with_capacity(cut + 3);
            clipped.push_str(&value[..cut]);
            clipped.push_str("...");
            clipped
        }
        None => value.to_owned(),
    }
}
330
331#[async_trait]
332impl PluginExtractionPort for ExtractionEngine {
333    async fn execute(&self, request: &ExtractionRequest) -> Result<ExtractionResult> {
334        Self::execute(request)
335    }
336
337    async fn validate_selector(&self, html: &str, selector_expr: &str) -> Result<(bool, usize)> {
338        let document = Html::parse_document(html);
339
340        ScraperSelector::parse(selector_expr).map_or(Ok((false, 0)), |selector| {
341            let count = document.select(&selector).count();
342            Ok((true, count))
343        })
344    }
345}
346
#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::{ExtractionTemplate, Region, Selector, Transformation};
    use serde_json::{Value, json};

    /// A selector with exactly one match yields a plain JSON string.
    #[test]
    fn test_extract_single_element() -> crate::Result<()> {
        let page = r#"<div><p class="title">Hello World</p></div>"#;

        let template = ExtractionTemplate::new("Test").with_region(Region::new(
            "title",
            Selector::css(".title"),
            json!({"type": "string"}),
        ));
        let request = ExtractionRequest::new(template, "http://example.com", page);

        let outcome = ExtractionEngine::execute(&request)?;

        assert!(outcome.is_fully_successful());
        assert_eq!(outcome.data.get("title"), Some(&json!("Hello World")));
        Ok(())
    }

    /// A selector with several matches yields a JSON array, one entry per match.
    #[test]
    fn test_extract_multiple_elements() -> crate::Result<()> {
        let page = r#"
            <div>
                <p class="item">Item 1</p>
                <p class="item">Item 2</p>
                <p class="item">Item 3</p>
            </div>
        "#;

        let template = ExtractionTemplate::new("Test").with_region(Region::new(
            "items",
            Selector::css(".item"),
            json!({"type": "array"}),
        ));
        let request = ExtractionRequest::new(template, "http://example.com", page);

        let outcome = ExtractionEngine::execute(&request)?;

        let extracted = outcome.data.get("items").and_then(Value::as_array);
        assert_eq!(extracted.map(Vec::len), Some(3));
        Ok(())
    }

    /// Region transformations are applied to the extracted value.
    #[test]
    fn test_extract_with_transformation() -> crate::Result<()> {
        let page = r#"<div><p class="price">  $19.99  </p></div>"#;

        let trimmed_price = Region::new("price", Selector::css(".price"), json!({"type": "string"}))
            .with_transformation(Transformation::Trim);
        let template = ExtractionTemplate::new("Test").with_region(trimmed_price);
        let request = ExtractionRequest::new(template, "http://example.com", page);

        let outcome = ExtractionEngine::execute(&request)?;

        assert_eq!(outcome.data.get("price"), Some(&json!("$19.99")));
        Ok(())
    }

    /// A parsable selector is reported valid whether or not it matches anything.
    #[tokio::test]
    async fn test_selector_validation() -> crate::Result<()> {
        let page = r#"<div><p class="test">Content</p></div>"#;
        let engine = ExtractionEngine;

        let (is_valid, hits) = engine.validate_selector(page, ".test").await?;
        assert!(is_valid);
        assert_eq!(hits, 1);

        let (is_valid, hits) = engine.validate_selector(page, ".nonexistent").await?;
        assert!(is_valid);
        assert_eq!(hits, 0);
        Ok(())
    }

    /// An unparsable selector is reported invalid instead of raising an error.
    #[tokio::test]
    async fn test_invalid_css_selector() -> crate::Result<()> {
        let page = "<div><p>Content</p></div>";
        let engine = ExtractionEngine;

        let (is_valid, _) = engine.validate_selector(page, ">>>invalid").await?;
        assert!(!is_valid);
        Ok(())
    }

    /// Selector lists, `:nth-child`, attribute operators and descendant
    /// combinators are all understood.
    #[tokio::test]
    async fn test_supported_css_selector_features() -> crate::Result<()> {
        let page = r#"
            <table>
                <tr data-testid="person-row">
                    <td>skip</td>
                    <td><span class="name">Ada Lovelace</span></td>
                    <td data-testid="name-cell"><span class="title">Founder</span></td>
                </tr>
            </table>
        "#;
        let engine = ExtractionEngine;
        let combined_selector =
            "td:nth-child(2), [data-testid*='name'] .title, tr[data-testid='person-row'] .name";

        let (is_valid, hits) = engine.validate_selector(page, combined_selector).await?;

        assert!(is_valid);
        assert_eq!(hits, 3);
        Ok(())
    }

    /// Diagnostics report the match count, the raw match, and every
    /// transformation step.
    #[test]
    fn test_diagnostics_capture_match_and_transformations() {
        let page = r#"<div><span class="name">  Ada Lovelace  </span></div>"#;
        let name_region = Region::new(
            "full_name",
            Selector::css(".name"),
            json!({"type": "string"}),
        )
        .with_transformation(Transformation::Trim)
        .with_transformation(Transformation::Uppercase);
        let template = ExtractionTemplate::new("Debug Test").with_region(name_region);
        let request = ExtractionRequest::new(template, "http://example.com", page);

        let report = ExtractionEngine::diagnose(&request, "document");
        let entry = report.regions.get("full_name");

        assert!(entry.is_some());
        assert_eq!(entry.map(|info| info.match_count), Some(1));
        assert_eq!(
            entry.and_then(|info| info.final_value.as_deref()),
            Some("ADA LOVELACE"),
        );
        assert_eq!(
            entry.map(|info| info.transformation_output_chain.len()),
            Some(2),
        );
        assert!(
            entry
                .and_then(|info| info.raw_match_html.as_deref())
                .is_some_and(|snippet| snippet.contains("Ada Lovelace"))
        );
    }
}
493}