stygian_plugin/adapters/
extraction_engine.rs1use crate::domain::{ExtractionRequest, ExtractionResult, RegionStatus};
6use crate::error::PluginError;
7use crate::{Result, ports::PluginExtractionPort};
8use async_trait::async_trait;
9use scraper::{Html, Selector as ScraperSelector};
10use std::time::Instant;
11
/// Stateless extraction adapter.
///
/// The engine carries no data of its own: all inputs arrive through the
/// `ExtractionRequest` handed to [`ExtractionEngine::execute`], and the type
/// also serves as the implementor of `PluginExtractionPort`. Because it is a
/// unit struct, it is free to copy, compare by construction, and default.
#[derive(Debug, Clone, Copy, Default)]
pub struct ExtractionEngine;
17
18impl ExtractionEngine {
19 pub fn execute(request: &ExtractionRequest) -> Result<ExtractionResult> {
21 request.validate()?;
22 request.template.validate()?;
23
24 let start = Instant::now();
25 let document = Html::parse_document(&request.html);
26
27 let mut result = ExtractionResult::new(request.idempotency_key);
28 let mut successful_regions = 0;
29
30 for region in &request.template.regions {
31 region.validate()?;
32
33 match execute_region(&document, region) {
34 Ok(extracted_values) => {
35 let count = extracted_values.len();
36
37 let result_value = if count == 1 {
39 serde_json::Value::String(extracted_values.into_iter().next().ok_or_else(
40 || {
41 PluginError::ExtractionError(
42 "selector matched a single value, but none were extracted"
43 .to_string(),
44 )
45 },
46 )?)
47 } else {
48 serde_json::Value::Array(
49 extracted_values
50 .into_iter()
51 .map(serde_json::Value::String)
52 .collect(),
53 )
54 };
55
56 result
57 .data
58 .insert(region.name.clone(), result_value.clone());
59 result.metadata.region_status.insert(
60 region.name.clone(),
61 RegionStatus {
62 success: true,
63 matched_count: count,
64 error: None,
65 },
66 );
67 successful_regions += 1;
68 }
69 Err(e) => {
70 result.metadata.region_status.insert(
71 region.name.clone(),
72 RegionStatus {
73 success: false,
74 matched_count: 0,
75 error: Some(e.to_string()),
76 },
77 );
78 result = result.with_error(format!("Region '{}': {}", region.name, e));
79 }
80 }
81 }
82
83 if request.template.regions.is_empty() {
85 result.metadata.selector_success_rate = 100.0;
86 } else {
87 let successful = u16::try_from(successful_regions).unwrap_or(u16::MAX);
88 let total = u16::try_from(request.template.regions.len()).unwrap_or(u16::MAX);
89 result.metadata.selector_success_rate =
90 (f32::from(successful) / f32::from(total)) * 100.0;
91 }
92
93 let elapsed = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
94 result = result.set_elapsed_ms(elapsed);
95
96 Ok(result)
97 }
98}
99
100fn execute_region(document: &Html, region: &crate::domain::Region) -> Result<Vec<String>> {
102 let selector_text = match ®ion.selector {
104 crate::domain::Selector::XPath(_) => {
105 return Err(crate::error::PluginError::ExtractionError(
106 "XPath selectors are not yet supported. Please use CSS selectors instead."
107 .to_string(),
108 ));
109 }
110 crate::domain::Selector::Css(css) | crate::domain::Selector::Both { css, .. } => css,
111 };
112
113 let selector = ScraperSelector::parse(selector_text).map_err(|e| {
115 crate::error::PluginError::SelectorError {
116 selector: selector_text.clone(),
117 reason: format!("Failed to parse CSS selector: {e:?}"),
118 }
119 })?;
120
121 let mut results = Vec::new();
122
123 for element in document.select(&selector) {
125 let text = element.inner_html();
126
127 let transformed =
129 crate::domain::Transformation::apply_chain(®ion.transformations, text)?;
130
131 results.push(transformed);
132 }
133
134 if results.is_empty() {
135 return Err(crate::error::PluginError::ExtractionError(format!(
136 "No elements matched CSS selector: {selector_text}"
137 )));
138 }
139
140 Ok(results)
141}
142
143#[async_trait]
144impl PluginExtractionPort for ExtractionEngine {
145 async fn execute(&self, request: &ExtractionRequest) -> Result<ExtractionResult> {
146 Self::execute(request)
147 }
148
149 async fn validate_selector(&self, html: &str, selector_expr: &str) -> Result<(bool, usize)> {
150 let document = Html::parse_document(html);
151
152 ScraperSelector::parse(selector_expr).map_or(Ok((false, 0)), |selector| {
153 let count = document.select(&selector).count();
154 Ok((true, count))
155 })
156 }
157}
158
#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::{ExtractionTemplate, Region, Selector, Transformation};
    use serde_json::{Value, json};

    /// Wraps `region` in a single-region template named "Test" and runs the
    /// engine over `html`.
    fn run_single_region(html: &str, region: Region) -> crate::Result<ExtractionResult> {
        let template = ExtractionTemplate::new("Test").with_region(region);
        let request = ExtractionRequest::new(template, "http://example.com", html);
        ExtractionEngine::execute(&request)
    }

    /// One matching element is stored as a plain JSON string.
    #[test]
    fn test_extract_single_element() -> crate::Result<()> {
        let region = Region::new("title", Selector::css(".title"), json!({"type": "string"}));
        let outcome =
            run_single_region(r#"<div><p class="title">Hello World</p></div>"#, region)?;

        assert!(outcome.is_fully_successful());
        assert_eq!(outcome.data.get("title"), Some(&json!("Hello World")));
        Ok(())
    }

    /// Several matching elements are stored as a JSON array, one entry each.
    #[test]
    fn test_extract_multiple_elements() -> crate::Result<()> {
        let html = r#"
            <div>
                <p class="item">Item 1</p>
                <p class="item">Item 2</p>
                <p class="item">Item 3</p>
            </div>
        "#;
        let region = Region::new("items", Selector::css(".item"), json!({"type": "array"}));

        let outcome = run_single_region(html, region)?;

        let items = outcome.data.get("items").and_then(Value::as_array);
        assert_eq!(items.map(Vec::len), Some(3));
        Ok(())
    }

    /// A `Trim` transformation strips the whitespace around the matched value.
    #[test]
    fn test_extract_with_transformation() -> crate::Result<()> {
        let region = Region::new("price", Selector::css(".price"), json!({"type": "string"}))
            .with_transformation(Transformation::Trim);

        let outcome = run_single_region(r#"<div><p class="price"> $19.99 </p></div>"#, region)?;

        assert_eq!(outcome.data.get("price"), Some(&json!("$19.99")));
        Ok(())
    }

    /// A parseable selector is valid whether or not it matches anything.
    #[tokio::test]
    async fn test_selector_validation() -> crate::Result<()> {
        let html = r#"<div><p class="test">Content</p></div>"#;
        let engine = ExtractionEngine;

        assert_eq!(engine.validate_selector(html, ".test").await?, (true, 1));
        assert_eq!(
            engine.validate_selector(html, ".nonexistent").await?,
            (true, 0)
        );
        Ok(())
    }

    /// A selector that fails to parse is reported as invalid, not as an error.
    #[tokio::test]
    async fn test_invalid_css_selector() -> crate::Result<()> {
        let engine = ExtractionEngine;

        let (is_valid, _) = engine
            .validate_selector("<div><p>Content</p></div>", ">>>invalid")
            .await?;

        assert!(!is_valid);
        Ok(())
    }
}