Skip to main content

stygian_plugin/adapters/
extraction_engine.rs

//! Extraction engine: the core adapter that executes extraction templates against HTML.
//!
//! Uses the `scraper` crate (CSS selectors) to extract data and transform it according to templates.
4
5use crate::domain::{ExtractionRequest, ExtractionResult, RegionStatus};
6use crate::error::PluginError;
7use crate::{Result, ports::PluginExtractionPort};
8use async_trait::async_trait;
9use scraper::{Html, Selector as ScraperSelector};
10use serde::Serialize;
11use std::collections::HashMap;
12use std::time::Instant;
13
/// Stateless engine that runs extraction templates against HTML documents.
///
/// CSS selectors are evaluated with the `scraper` crate. The inner HTML of every
/// matched element is passed through the region's transformation chain, and the
/// outcomes are assembled into a structured extraction result.
pub struct ExtractionEngine;
19
20#[derive(Debug, Clone, Serialize)]
21pub struct TransformationDebugStep {
22    pub transformation: String,
23    pub input: String,
24    pub output: Option<String>,
25    pub error: Option<String>,
26}
27
28#[derive(Debug, Clone, Serialize)]
29pub struct RegionDebugInfo {
30    pub selector: String,
31    pub selector_kind: String,
32    pub evaluation_scope: String,
33    pub match_count: usize,
34    pub raw_match_html: Option<String>,
35    pub raw_extracted_value: Option<String>,
36    pub transformation_output_chain: Vec<TransformationDebugStep>,
37    pub final_value: Option<String>,
38    pub error: Option<String>,
39}
40
41#[derive(Debug, Clone, Serialize)]
42pub struct ExtractionDebugInfo {
43    pub evaluation_scope: String,
44    pub root_html_snippet: String,
45    pub regions: HashMap<String, RegionDebugInfo>,
46}
47
48impl ExtractionEngine {
49    /// Execute an extraction request
50    pub fn execute(request: &ExtractionRequest) -> Result<ExtractionResult> {
51        request.validate()?;
52        request.template.validate()?;
53
54        let start = Instant::now();
55        let document = Html::parse_document(&request.html);
56
57        let mut result = ExtractionResult::new(request.idempotency_key);
58        let mut successful_regions = 0;
59
60        for region in &request.template.regions {
61            region.validate()?;
62
63            match execute_region(&document, region) {
64                Ok(extracted_values) => {
65                    let count = extracted_values.len();
66
67                    // For single values, return as-is; for multiple, return array
68                    let result_value = if count == 1 {
69                        serde_json::Value::String(extracted_values.into_iter().next().ok_or_else(
70                            || {
71                                PluginError::ExtractionError(
72                                    "selector matched a single value, but none were extracted"
73                                        .to_string(),
74                                )
75                            },
76                        )?)
77                    } else {
78                        serde_json::Value::Array(
79                            extracted_values
80                                .into_iter()
81                                .map(serde_json::Value::String)
82                                .collect(),
83                        )
84                    };
85
86                    result
87                        .data
88                        .insert(region.name.clone(), result_value.clone());
89                    result.metadata.region_status.insert(
90                        region.name.clone(),
91                        RegionStatus {
92                            success: true,
93                            matched_count: count,
94                            error: None,
95                        },
96                    );
97                    successful_regions += 1;
98                }
99                Err(e) => {
100                    result.metadata.region_status.insert(
101                        region.name.clone(),
102                        RegionStatus {
103                            success: false,
104                            matched_count: 0,
105                            error: Some(e.to_string()),
106                        },
107                    );
108                    result = result.with_error(format!("Region '{}': {}", region.name, e));
109                }
110            }
111        }
112
113        // Calculate success rate
114        if request.template.regions.is_empty() {
115            result.metadata.selector_success_rate = 100.0;
116        } else {
117            let successful = u16::try_from(successful_regions).unwrap_or(u16::MAX);
118            let total = u16::try_from(request.template.regions.len()).unwrap_or(u16::MAX);
119            result.metadata.selector_success_rate =
120                (f32::from(successful) / f32::from(total)) * 100.0;
121        }
122
123        let elapsed = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
124        result = result.set_elapsed_ms(elapsed);
125
126        Ok(result)
127    }
128
129    pub fn diagnose(request: &ExtractionRequest, evaluation_scope: &str) -> ExtractionDebugInfo {
130        let document = Html::parse_document(&request.html);
131        let mut regions = HashMap::new();
132
133        for region in &request.template.regions {
134            regions.insert(
135                region.name.clone(),
136                diagnose_region(&document, region, evaluation_scope),
137            );
138        }
139
140        ExtractionDebugInfo {
141            evaluation_scope: evaluation_scope.to_string(),
142            root_html_snippet: truncate_debug(&request.html, 2_000),
143            regions,
144        }
145    }
146}
147
148/// Execute extraction for a single region
149fn execute_region(document: &Html, region: &crate::domain::Region) -> Result<Vec<String>> {
150    // Check selector type and route accordingly
151    let selector_text = match &region.selector {
152        crate::domain::Selector::XPath(_) => {
153            return Err(crate::error::PluginError::ExtractionError(
154                "XPath selectors are not yet supported. Please use CSS selectors instead."
155                    .to_string(),
156            ));
157        }
158        crate::domain::Selector::Css(css) | crate::domain::Selector::Both { css, .. } => css,
159    };
160
161    // Parse as CSS selector
162    let selector = ScraperSelector::parse(selector_text).map_err(|e| {
163        crate::error::PluginError::SelectorError {
164            selector: selector_text.clone(),
165            reason: format!("Failed to parse CSS selector: {e:?}"),
166        }
167    })?;
168
169    let mut results = Vec::new();
170
171    // Select all matching elements
172    for element in document.select(&selector) {
173        let text = element.inner_html();
174
175        // Apply transformation chain to the extracted text
176        let transformed =
177            crate::domain::Transformation::apply_chain(&region.transformations, text)?;
178
179        results.push(transformed);
180    }
181
182    if results.is_empty() {
183        return Err(crate::error::PluginError::ExtractionError(format!(
184            "No elements matched CSS selector: {selector_text}"
185        )));
186    }
187
188    Ok(results)
189}
190
191fn diagnose_region(
192    document: &Html,
193    region: &crate::domain::Region,
194    evaluation_scope: &str,
195) -> RegionDebugInfo {
196    let (selector_kind, selector_text) = match &region.selector {
197        crate::domain::Selector::Css(css) => ("css", css.as_str()),
198        crate::domain::Selector::XPath(xpath) => ("xpath", xpath.as_str()),
199        crate::domain::Selector::Both { css, .. } => ("dual", css.as_str()),
200    };
201
202    if matches!(&region.selector, crate::domain::Selector::XPath(_)) {
203        return RegionDebugInfo {
204            selector: selector_text.to_string(),
205            selector_kind: selector_kind.to_string(),
206            evaluation_scope: evaluation_scope.to_string(),
207            match_count: 0,
208            raw_match_html: None,
209            raw_extracted_value: None,
210            transformation_output_chain: Vec::new(),
211            final_value: None,
212            error: Some(
213                "XPath selectors are not yet supported. Please use CSS selectors instead."
214                    .to_string(),
215            ),
216        };
217    }
218
219    let selector = match ScraperSelector::parse(selector_text) {
220        Ok(selector) => selector,
221        Err(error) => {
222            return RegionDebugInfo {
223                selector: selector_text.to_string(),
224                selector_kind: selector_kind.to_string(),
225                evaluation_scope: evaluation_scope.to_string(),
226                match_count: 0,
227                raw_match_html: None,
228                raw_extracted_value: None,
229                transformation_output_chain: Vec::new(),
230                final_value: None,
231                error: Some(format!("Failed to parse CSS selector: {error:?}")),
232            };
233        }
234    };
235
236    let elements: Vec<_> = document.select(&selector).collect();
237    let match_count = elements.len();
238
239    let Some(first_match) = elements.first() else {
240        return RegionDebugInfo {
241            selector: selector_text.to_string(),
242            selector_kind: selector_kind.to_string(),
243            evaluation_scope: evaluation_scope.to_string(),
244            match_count,
245            raw_match_html: None,
246            raw_extracted_value: None,
247            transformation_output_chain: Vec::new(),
248            final_value: None,
249            error: Some(format!("No elements matched CSS selector: {selector_text}")),
250        };
251    };
252
253    let raw_match_html = truncate_debug(&first_match.html(), 800);
254    let raw_extracted_value = first_match.inner_html();
255    let (transformation_output_chain, final_value, error) =
256        trace_transformations(&region.transformations, &raw_extracted_value);
257
258    RegionDebugInfo {
259        selector: selector_text.to_string(),
260        selector_kind: selector_kind.to_string(),
261        evaluation_scope: evaluation_scope.to_string(),
262        match_count,
263        raw_match_html: Some(raw_match_html),
264        raw_extracted_value: Some(truncate_debug(&raw_extracted_value, 800)),
265        transformation_output_chain,
266        final_value: final_value.map(|value| truncate_debug(&value, 800)),
267        error,
268    }
269}
270
271fn trace_transformations(
272    transformations: &[crate::domain::Transformation],
273    raw_value: &str,
274) -> (Vec<TransformationDebugStep>, Option<String>, Option<String>) {
275    let mut current = raw_value.to_string();
276    let mut steps = Vec::with_capacity(transformations.len());
277
278    for transformation in transformations {
279        let input = current.clone();
280        match transformation.apply(&current) {
281            Ok(output) => {
282                steps.push(TransformationDebugStep {
283                    transformation: format!("{transformation:?}"),
284                    input: truncate_debug(&input, 400),
285                    output: Some(truncate_debug(&output, 400)),
286                    error: None,
287                });
288                current = output;
289            }
290            Err(error) => {
291                let error_text = error.to_string();
292                steps.push(TransformationDebugStep {
293                    transformation: format!("{transformation:?}"),
294                    input: truncate_debug(&input, 400),
295                    output: None,
296                    error: Some(error_text.clone()),
297                });
298                return (steps, None, Some(error_text));
299            }
300        }
301    }
302
303    (steps, Some(current), None)
304}
305
/// Shorten `value` to at most `max_chars` characters for debug output.
///
/// Counts Unicode scalar values, not bytes, so multi-byte text is never split
/// mid-character. When anything is cut off, `"..."` is appended after the kept
/// prefix; a value that already fits is returned unchanged.
fn truncate_debug(value: &str, max_chars: usize) -> String {
    // The byte offset of the character at index `max_chars` (if any) is exactly
    // where the kept prefix ends.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => {
            let mut shortened = String::with_capacity(cut + 3);
            shortened.push_str(&value[..cut]);
            shortened.push_str("...");
            shortened
        }
        None => value.to_owned(),
    }
}
319
320#[async_trait]
321impl PluginExtractionPort for ExtractionEngine {
322    async fn execute(&self, request: &ExtractionRequest) -> Result<ExtractionResult> {
323        Self::execute(request)
324    }
325
326    async fn validate_selector(&self, html: &str, selector_expr: &str) -> Result<(bool, usize)> {
327        let document = Html::parse_document(html);
328
329        ScraperSelector::parse(selector_expr).map_or(Ok((false, 0)), |selector| {
330            let count = document.select(&selector).count();
331            Ok((true, count))
332        })
333    }
334}
335
#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::{ExtractionTemplate, Region, Selector, Transformation};
    use serde_json::{Value, json};

    /// Build a request whose template (named `template_name`) contains only `region`.
    fn request_with_region(
        template_name: &'static str,
        region: Region,
        html: &'static str,
    ) -> ExtractionRequest {
        let template = ExtractionTemplate::new(template_name).with_region(region);
        ExtractionRequest::new(template, "http://example.com", html)
    }

    /// A selector with exactly one match yields a plain JSON string.
    #[test]
    fn test_extract_single_element() -> crate::Result<()> {
        let title = Region::new("title", Selector::css(".title"), json!({"type": "string"}));
        let request = request_with_region(
            "Test",
            title,
            r#"<div><p class="title">Hello World</p></div>"#,
        );

        let outcome = ExtractionEngine::execute(&request)?;

        assert!(outcome.is_fully_successful());
        assert_eq!(outcome.data.get("title"), Some(&json!("Hello World")));
        Ok(())
    }

    /// A selector with several matches yields a JSON array with one entry per match.
    #[test]
    fn test_extract_multiple_elements() -> crate::Result<()> {
        let html = r#"
            <div>
                <p class="item">Item 1</p>
                <p class="item">Item 2</p>
                <p class="item">Item 3</p>
            </div>
        "#;
        let items = Region::new("items", Selector::css(".item"), json!({"type": "array"}));
        let request = request_with_region("Test", items, html);

        let outcome = ExtractionEngine::execute(&request)?;

        let extracted = outcome.data.get("items").and_then(Value::as_array);
        assert_eq!(extracted.map(std::vec::Vec::len), Some(3));
        Ok(())
    }

    /// Transformations attached to a region are applied to the extracted value.
    #[test]
    fn test_extract_with_transformation() -> crate::Result<()> {
        let price = Region::new("price", Selector::css(".price"), json!({"type": "string"}))
            .with_transformation(Transformation::Trim);
        let request = request_with_region(
            "Test",
            price,
            r#"<div><p class="price">  $19.99  </p></div>"#,
        );

        let outcome = ExtractionEngine::execute(&request)?;

        assert_eq!(outcome.data.get("price"), Some(&json!("$19.99")));
        Ok(())
    }

    /// A parseable selector is reported as valid whether or not it matches anything.
    #[tokio::test]
    async fn test_selector_validation() -> crate::Result<()> {
        let engine = ExtractionEngine;
        let html = r#"<div><p class="test">Content</p></div>"#;

        let (matching_valid, matching_count) = engine.validate_selector(html, ".test").await?;
        assert!(matching_valid);
        assert_eq!(matching_count, 1);

        let (missing_valid, missing_count) =
            engine.validate_selector(html, ".nonexistent").await?;
        assert!(missing_valid);
        assert_eq!(missing_count, 0);
        Ok(())
    }

    /// An unparseable selector is reported as invalid rather than as an error.
    #[tokio::test]
    async fn test_invalid_css_selector() -> crate::Result<()> {
        let engine = ExtractionEngine;

        let (valid, _) = engine
            .validate_selector("<div><p>Content</p></div>", ">>>invalid")
            .await?;

        assert!(!valid);
        Ok(())
    }

    /// Structural pseudo-classes, attribute operators, and selector lists are all supported.
    #[tokio::test]
    async fn test_supported_css_selector_features() -> crate::Result<()> {
        let engine = ExtractionEngine;
        let html = r#"
            <table>
                <tr data-testid="person-row">
                    <td>skip</td>
                    <td><span class="name">Ada Lovelace</span></td>
                    <td data-testid="name-cell"><span class="title">Founder</span></td>
                </tr>
            </table>
        "#;
        let selector_list =
            "td:nth-child(2), [data-testid*='name'] .title, tr[data-testid='person-row'] .name";

        let (valid, count) = engine.validate_selector(html, selector_list).await?;

        assert!(valid);
        assert_eq!(count, 3);
        Ok(())
    }

    /// Diagnostics expose the match count, the raw match, and every transformation step.
    #[test]
    fn test_diagnostics_capture_match_and_transformations() {
        let full_name = Region::new(
            "full_name",
            Selector::css(".name"),
            json!({"type": "string"}),
        )
        .with_transformation(Transformation::Trim)
        .with_transformation(Transformation::Uppercase);
        let request = request_with_region(
            "Debug Test",
            full_name,
            r#"<div><span class="name">  Ada Lovelace  </span></div>"#,
        );

        let diagnostics = ExtractionEngine::diagnose(&request, "document");
        let traced = diagnostics.regions.get("full_name");

        assert!(traced.is_some());
        assert_eq!(traced.map(|info| info.match_count), Some(1));
        assert_eq!(
            traced.and_then(|info| info.final_value.as_deref()),
            Some("ADA LOVELACE"),
        );
        assert_eq!(
            traced.map(|info| info.transformation_output_chain.len()),
            Some(2),
        );
        assert!(
            traced
                .and_then(|info| info.raw_match_html.as_deref())
                .is_some_and(|html| html.contains("Ada Lovelace"))
        );
    }
}