1use crate::domain::{ExtractionRequest, ExtractionResult, RegionStatus};
6use crate::error::PluginError;
7use crate::{Result, ports::PluginExtractionPort};
8use async_trait::async_trait;
9use scraper::{Html, Selector as ScraperSelector};
10use serde::Serialize;
11use std::collections::HashMap;
12use std::time::Instant;
13
/// Stateless extraction engine.
///
/// All extraction logic lives in associated functions (`execute`, `diagnose`); the unit value
/// only exists so the engine can be used through the `PluginExtractionPort` trait.
pub struct ExtractionEngine;
19
/// One step of a region's transformation chain, as recorded by the diagnostics path.
#[derive(Debug, Clone, Serialize)]
pub struct TransformationDebugStep {
    /// `Debug` rendering of the transformation that was applied.
    pub transformation: String,
    /// Value fed into the transformation, truncated for display.
    pub input: String,
    /// Value produced by the transformation (truncated), or `None` when it failed.
    pub output: Option<String>,
    /// Error message when the transformation failed, otherwise `None`.
    pub error: Option<String>,
}
27
/// Diagnostic record describing how a single region evaluated against the document.
#[derive(Debug, Clone, Serialize)]
pub struct RegionDebugInfo {
    /// Selector text that was evaluated (the CSS text, or the XPath text for XPath-only regions).
    pub selector: String,
    /// Kind of selector the region declared: `"css"`, `"xpath"` or `"dual"`.
    pub selector_kind: String,
    /// Caller-supplied label describing the scope the selector was evaluated in.
    pub evaluation_scope: String,
    /// Number of elements the selector matched; `0` when evaluation failed.
    pub match_count: usize,
    /// HTML of the first matched element (truncated), if anything matched.
    pub raw_match_html: Option<String>,
    /// Inner HTML of the first matched element before transformations (truncated).
    pub raw_extracted_value: Option<String>,
    /// One entry per transformation attempted on the first match, in order; ends at the
    /// first failing transformation.
    pub transformation_output_chain: Vec<TransformationDebugStep>,
    /// Value after the whole transformation chain (truncated); `None` on any failure.
    pub final_value: Option<String>,
    /// Description of why the region produced no value, if it did not.
    pub error: Option<String>,
}
40
/// Diagnostics for a whole extraction request.
#[derive(Debug, Clone, Serialize)]
pub struct ExtractionDebugInfo {
    /// Caller-supplied label describing the scope the selectors were evaluated in.
    pub evaluation_scope: String,
    /// Leading part of the request HTML, truncated for display.
    pub root_html_snippet: String,
    /// Per-region diagnostics keyed by region name.
    pub regions: HashMap<String, RegionDebugInfo>,
}
47
48impl ExtractionEngine {
49 pub fn execute(request: &ExtractionRequest) -> Result<ExtractionResult> {
61 request.validate()?;
62 request.template.validate()?;
63
64 let start = Instant::now();
65 let document = Html::parse_document(&request.html);
66
67 let mut result = ExtractionResult::new(request.idempotency_key);
68 let mut successful_regions = 0;
69
70 for region in &request.template.regions {
71 region.validate()?;
72
73 match execute_region(&document, region) {
74 Ok(extracted_values) => {
75 let count = extracted_values.len();
76
77 let result_value = if count == 1 {
79 serde_json::Value::String(extracted_values.into_iter().next().ok_or_else(
80 || {
81 PluginError::ExtractionError(
82 "selector matched a single value, but none were extracted"
83 .to_string(),
84 )
85 },
86 )?)
87 } else {
88 serde_json::Value::Array(
89 extracted_values
90 .into_iter()
91 .map(serde_json::Value::String)
92 .collect(),
93 )
94 };
95
96 result
97 .data
98 .insert(region.name.clone(), result_value.clone());
99 result.metadata.region_status.insert(
100 region.name.clone(),
101 RegionStatus {
102 success: true,
103 matched_count: count,
104 error: None,
105 },
106 );
107 successful_regions += 1;
108 }
109 Err(e) => {
110 result.metadata.region_status.insert(
111 region.name.clone(),
112 RegionStatus {
113 success: false,
114 matched_count: 0,
115 error: Some(e.to_string()),
116 },
117 );
118 result = result.with_error(format!("Region '{}': {}", region.name, e));
119 }
120 }
121 }
122
123 if request.template.regions.is_empty() {
125 result.metadata.selector_success_rate = 100.0;
126 } else {
127 let successful = u16::try_from(successful_regions).unwrap_or(u16::MAX);
128 let total = u16::try_from(request.template.regions.len()).unwrap_or(u16::MAX);
129 result.metadata.selector_success_rate =
130 (f32::from(successful) / f32::from(total)) * 100.0;
131 }
132
133 let elapsed = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
134 result = result.set_elapsed_ms(elapsed);
135
136 Ok(result)
137 }
138
139 #[must_use]
140 pub fn diagnose(request: &ExtractionRequest, evaluation_scope: &str) -> ExtractionDebugInfo {
141 let document = Html::parse_document(&request.html);
142 let mut regions = HashMap::new();
143
144 for region in &request.template.regions {
145 regions.insert(
146 region.name.clone(),
147 diagnose_region(&document, region, evaluation_scope),
148 );
149 }
150
151 ExtractionDebugInfo {
152 evaluation_scope: evaluation_scope.to_string(),
153 root_html_snippet: truncate_debug(&request.html, 2_000),
154 regions,
155 }
156 }
157}
158
159fn execute_region(document: &Html, region: &crate::domain::Region) -> Result<Vec<String>> {
161 let selector_text = match ®ion.selector {
163 crate::domain::Selector::XPath(_) => {
164 return Err(crate::error::PluginError::ExtractionError(
165 "XPath selectors are not yet supported. Please use CSS selectors instead."
166 .to_string(),
167 ));
168 }
169 crate::domain::Selector::Css(css) | crate::domain::Selector::Both { css, .. } => css,
170 };
171
172 let selector = ScraperSelector::parse(selector_text).map_err(|e| {
174 crate::error::PluginError::SelectorError {
175 selector: selector_text.clone(),
176 reason: format!("Failed to parse CSS selector: {e:?}"),
177 }
178 })?;
179
180 let mut results = Vec::new();
181
182 for element in document.select(&selector) {
184 let text = element.inner_html();
185
186 let transformed =
188 crate::domain::Transformation::apply_chain(®ion.transformations, text)?;
189
190 results.push(transformed);
191 }
192
193 if results.is_empty() {
194 return Err(crate::error::PluginError::ExtractionError(format!(
195 "No elements matched CSS selector: {selector_text}"
196 )));
197 }
198
199 Ok(results)
200}
201
202fn diagnose_region(
203 document: &Html,
204 region: &crate::domain::Region,
205 evaluation_scope: &str,
206) -> RegionDebugInfo {
207 let (selector_kind, selector_text) = match ®ion.selector {
208 crate::domain::Selector::Css(css) => ("css", css.as_str()),
209 crate::domain::Selector::XPath(xpath) => ("xpath", xpath.as_str()),
210 crate::domain::Selector::Both { css, .. } => ("dual", css.as_str()),
211 };
212
213 if matches!(®ion.selector, crate::domain::Selector::XPath(_)) {
214 return RegionDebugInfo {
215 selector: selector_text.to_string(),
216 selector_kind: selector_kind.to_string(),
217 evaluation_scope: evaluation_scope.to_string(),
218 match_count: 0,
219 raw_match_html: None,
220 raw_extracted_value: None,
221 transformation_output_chain: Vec::new(),
222 final_value: None,
223 error: Some(
224 "XPath selectors are not yet supported. Please use CSS selectors instead."
225 .to_string(),
226 ),
227 };
228 }
229
230 let selector = match ScraperSelector::parse(selector_text) {
231 Ok(selector) => selector,
232 Err(error) => {
233 return RegionDebugInfo {
234 selector: selector_text.to_string(),
235 selector_kind: selector_kind.to_string(),
236 evaluation_scope: evaluation_scope.to_string(),
237 match_count: 0,
238 raw_match_html: None,
239 raw_extracted_value: None,
240 transformation_output_chain: Vec::new(),
241 final_value: None,
242 error: Some(format!("Failed to parse CSS selector: {error:?}")),
243 };
244 }
245 };
246
247 let elements: Vec<_> = document.select(&selector).collect();
248 let match_count = elements.len();
249
250 let Some(first_match) = elements.first() else {
251 return RegionDebugInfo {
252 selector: selector_text.to_string(),
253 selector_kind: selector_kind.to_string(),
254 evaluation_scope: evaluation_scope.to_string(),
255 match_count,
256 raw_match_html: None,
257 raw_extracted_value: None,
258 transformation_output_chain: Vec::new(),
259 final_value: None,
260 error: Some(format!("No elements matched CSS selector: {selector_text}")),
261 };
262 };
263
264 let raw_match_html = truncate_debug(&first_match.html(), 800);
265 let raw_extracted_value = first_match.inner_html();
266 let (transformation_output_chain, final_value, error) =
267 trace_transformations(®ion.transformations, &raw_extracted_value);
268
269 RegionDebugInfo {
270 selector: selector_text.to_string(),
271 selector_kind: selector_kind.to_string(),
272 evaluation_scope: evaluation_scope.to_string(),
273 match_count,
274 raw_match_html: Some(raw_match_html),
275 raw_extracted_value: Some(truncate_debug(&raw_extracted_value, 800)),
276 transformation_output_chain,
277 final_value: final_value.map(|value| truncate_debug(&value, 800)),
278 error,
279 }
280}
281
282fn trace_transformations(
283 transformations: &[crate::domain::Transformation],
284 raw_value: &str,
285) -> (Vec<TransformationDebugStep>, Option<String>, Option<String>) {
286 let mut current = raw_value.to_string();
287 let mut steps = Vec::with_capacity(transformations.len());
288
289 for transformation in transformations {
290 let input = current.clone();
291 match transformation.apply(¤t) {
292 Ok(output) => {
293 steps.push(TransformationDebugStep {
294 transformation: format!("{transformation:?}"),
295 input: truncate_debug(&input, 400),
296 output: Some(truncate_debug(&output, 400)),
297 error: None,
298 });
299 current = output;
300 }
301 Err(error) => {
302 let error_text = error.to_string();
303 steps.push(TransformationDebugStep {
304 transformation: format!("{transformation:?}"),
305 input: truncate_debug(&input, 400),
306 output: None,
307 error: Some(error_text.clone()),
308 });
309 return (steps, None, Some(error_text));
310 }
311 }
312 }
313
314 (steps, Some(current), None)
315}
316
/// Returns at most `max_chars` characters of `value` for display in diagnostics.
///
/// Counting is done in `char`s, not bytes, so multi-byte characters are never split. When
/// `value` is longer than `max_chars` characters, `"..."` is appended after the kept prefix;
/// otherwise the value is returned unchanged.
fn truncate_debug(value: &str, max_chars: usize) -> String {
    let mut remaining = value.chars();
    let mut shortened: String = remaining.by_ref().take(max_chars).collect();

    // Anything still left in the iterator was cut off, so flag the truncation.
    if remaining.next().is_some() {
        shortened.push_str("...");
    }

    shortened
}
330
331#[async_trait]
332impl PluginExtractionPort for ExtractionEngine {
333 async fn execute(&self, request: &ExtractionRequest) -> Result<ExtractionResult> {
334 Self::execute(request)
335 }
336
337 async fn validate_selector(&self, html: &str, selector_expr: &str) -> Result<(bool, usize)> {
338 let document = Html::parse_document(html);
339
340 ScraperSelector::parse(selector_expr).map_or(Ok((false, 0)), |selector| {
341 let count = document.select(&selector).count();
342 Ok((true, count))
343 })
344 }
345}
346
#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::{ExtractionTemplate, Region, Selector, Transformation};
    use serde_json::{Value, json};

    /// A selector matching exactly one element yields a plain JSON string.
    #[test]
    fn test_extract_single_element() -> crate::Result<()> {
        let html = r#"<div><p class="title">Hello World</p></div>"#;
        let region = Region::new("title", Selector::css(".title"), json!({"type": "string"}));

        let request = ExtractionRequest::new(
            ExtractionTemplate::new("Test").with_region(region),
            "http://example.com",
            html,
        );
        let result = ExtractionEngine::execute(&request)?;

        assert!(result.is_fully_successful());
        assert_eq!(result.data.get("title"), Some(&json!("Hello World")));
        Ok(())
    }

    /// A selector matching several elements yields a JSON array with one entry per match.
    #[test]
    fn test_extract_multiple_elements() -> crate::Result<()> {
        let html = r#"
            <div>
                <p class="item">Item 1</p>
                <p class="item">Item 2</p>
                <p class="item">Item 3</p>
            </div>
        "#;
        let region = Region::new("items", Selector::css(".item"), json!({"type": "array"}));

        let request = ExtractionRequest::new(
            ExtractionTemplate::new("Test").with_region(region),
            "http://example.com",
            html,
        );
        let result = ExtractionEngine::execute(&request)?;

        let items = result.data.get("items").and_then(Value::as_array);
        assert_eq!(items.map(Vec::len), Some(3));
        Ok(())
    }

    /// Transformations attached to a region are applied to the extracted value.
    #[test]
    fn test_extract_with_transformation() -> crate::Result<()> {
        let html = r#"<div><p class="price"> $19.99 </p></div>"#;
        let region = Region::new("price", Selector::css(".price"), json!({"type": "string"}))
            .with_transformation(Transformation::Trim);

        let request = ExtractionRequest::new(
            ExtractionTemplate::new("Test").with_region(region),
            "http://example.com",
            html,
        );
        let result = ExtractionEngine::execute(&request)?;

        assert_eq!(result.data.get("price"), Some(&json!("$19.99")));
        Ok(())
    }

    /// A parsable selector is reported as valid whether or not it matches anything.
    #[tokio::test]
    async fn test_selector_validation() -> crate::Result<()> {
        let engine = ExtractionEngine;
        let html = r#"<div><p class="test">Content</p></div>"#;

        let (valid, count) = engine.validate_selector(html, ".test").await?;
        assert!(valid);
        assert_eq!(count, 1);

        let (valid, count) = engine.validate_selector(html, ".nonexistent").await?;
        assert!(valid);
        assert_eq!(count, 0);
        Ok(())
    }

    /// An unparsable selector is reported through the flag rather than as an error.
    #[tokio::test]
    async fn test_invalid_css_selector() -> crate::Result<()> {
        let engine = ExtractionEngine;
        let html = "<div><p>Content</p></div>";

        let (valid, _) = engine.validate_selector(html, ">>>invalid").await?;
        assert!(!valid);
        Ok(())
    }

    /// Selector lists, `:nth-child`, attribute matchers and descendant combinators all work.
    #[tokio::test]
    async fn test_supported_css_selector_features() -> crate::Result<()> {
        let engine = ExtractionEngine;
        let html = r#"
            <table>
                <tr data-testid="person-row">
                    <td>skip</td>
                    <td><span class="name">Ada Lovelace</span></td>
                    <td data-testid="name-cell"><span class="title">Founder</span></td>
                </tr>
            </table>
        "#;
        let selector_list =
            "td:nth-child(2), [data-testid*='name'] .title, tr[data-testid='person-row'] .name";

        let (valid, count) = engine.validate_selector(html, selector_list).await?;

        assert!(valid);
        assert_eq!(count, 3);
        Ok(())
    }

    /// Diagnostics expose the match count, the raw match and every transformation step.
    #[test]
    fn test_diagnostics_capture_match_and_transformations() {
        let html = r#"<div><span class="name"> Ada Lovelace </span></div>"#;
        let region = Region::new(
            "full_name",
            Selector::css(".name"),
            json!({"type": "string"}),
        )
        .with_transformation(Transformation::Trim)
        .with_transformation(Transformation::Uppercase);
        let request = ExtractionRequest::new(
            ExtractionTemplate::new("Debug Test").with_region(region),
            "http://example.com",
            html,
        );

        let diagnostics = ExtractionEngine::diagnose(&request, "document");
        let info = diagnostics.regions.get("full_name");

        assert!(info.is_some());
        assert_eq!(info.map(|value| value.match_count), Some(1));
        assert_eq!(
            info.map(|value| value.transformation_output_chain.len()),
            Some(2),
        );
        assert_eq!(
            info.and_then(|value| value.final_value.as_deref()),
            Some("ADA LOVELACE"),
        );

        let raw_html = info.and_then(|value| value.raw_match_html.as_deref());
        assert!(raw_html.is_some_and(|value| value.contains("Ada Lovelace")));
    }
}