1use crate::domain::{ExtractionRequest, ExtractionResult, RegionStatus};
6use crate::error::PluginError;
7use crate::{Result, ports::PluginExtractionPort};
8use async_trait::async_trait;
9use scraper::{Html, Selector as ScraperSelector};
10use serde::Serialize;
11use std::collections::HashMap;
12use std::time::Instant;
13
/// Stateless engine that evaluates an extraction template against an HTML document.
///
/// The type carries no data: extraction and diagnostics are exposed as associated
/// functions, and the same logic is reachable through its [`PluginExtractionPort`]
/// implementation.
#[derive(Debug, Clone, Copy, Default)]
pub struct ExtractionEngine;
19
20#[derive(Debug, Clone, Serialize)]
21pub struct TransformationDebugStep {
22 pub transformation: String,
23 pub input: String,
24 pub output: Option<String>,
25 pub error: Option<String>,
26}
27
28#[derive(Debug, Clone, Serialize)]
29pub struct RegionDebugInfo {
30 pub selector: String,
31 pub selector_kind: String,
32 pub evaluation_scope: String,
33 pub match_count: usize,
34 pub raw_match_html: Option<String>,
35 pub raw_extracted_value: Option<String>,
36 pub transformation_output_chain: Vec<TransformationDebugStep>,
37 pub final_value: Option<String>,
38 pub error: Option<String>,
39}
40
41#[derive(Debug, Clone, Serialize)]
42pub struct ExtractionDebugInfo {
43 pub evaluation_scope: String,
44 pub root_html_snippet: String,
45 pub regions: HashMap<String, RegionDebugInfo>,
46}
47
48impl ExtractionEngine {
49 pub fn execute(request: &ExtractionRequest) -> Result<ExtractionResult> {
51 request.validate()?;
52 request.template.validate()?;
53
54 let start = Instant::now();
55 let document = Html::parse_document(&request.html);
56
57 let mut result = ExtractionResult::new(request.idempotency_key);
58 let mut successful_regions = 0;
59
60 for region in &request.template.regions {
61 region.validate()?;
62
63 match execute_region(&document, region) {
64 Ok(extracted_values) => {
65 let count = extracted_values.len();
66
67 let result_value = if count == 1 {
69 serde_json::Value::String(extracted_values.into_iter().next().ok_or_else(
70 || {
71 PluginError::ExtractionError(
72 "selector matched a single value, but none were extracted"
73 .to_string(),
74 )
75 },
76 )?)
77 } else {
78 serde_json::Value::Array(
79 extracted_values
80 .into_iter()
81 .map(serde_json::Value::String)
82 .collect(),
83 )
84 };
85
86 result
87 .data
88 .insert(region.name.clone(), result_value.clone());
89 result.metadata.region_status.insert(
90 region.name.clone(),
91 RegionStatus {
92 success: true,
93 matched_count: count,
94 error: None,
95 },
96 );
97 successful_regions += 1;
98 }
99 Err(e) => {
100 result.metadata.region_status.insert(
101 region.name.clone(),
102 RegionStatus {
103 success: false,
104 matched_count: 0,
105 error: Some(e.to_string()),
106 },
107 );
108 result = result.with_error(format!("Region '{}': {}", region.name, e));
109 }
110 }
111 }
112
113 if request.template.regions.is_empty() {
115 result.metadata.selector_success_rate = 100.0;
116 } else {
117 let successful = u16::try_from(successful_regions).unwrap_or(u16::MAX);
118 let total = u16::try_from(request.template.regions.len()).unwrap_or(u16::MAX);
119 result.metadata.selector_success_rate =
120 (f32::from(successful) / f32::from(total)) * 100.0;
121 }
122
123 let elapsed = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
124 result = result.set_elapsed_ms(elapsed);
125
126 Ok(result)
127 }
128
129 pub fn diagnose(request: &ExtractionRequest, evaluation_scope: &str) -> ExtractionDebugInfo {
130 let document = Html::parse_document(&request.html);
131 let mut regions = HashMap::new();
132
133 for region in &request.template.regions {
134 regions.insert(
135 region.name.clone(),
136 diagnose_region(&document, region, evaluation_scope),
137 );
138 }
139
140 ExtractionDebugInfo {
141 evaluation_scope: evaluation_scope.to_string(),
142 root_html_snippet: truncate_debug(&request.html, 2_000),
143 regions,
144 }
145 }
146}
147
148fn execute_region(document: &Html, region: &crate::domain::Region) -> Result<Vec<String>> {
150 let selector_text = match ®ion.selector {
152 crate::domain::Selector::XPath(_) => {
153 return Err(crate::error::PluginError::ExtractionError(
154 "XPath selectors are not yet supported. Please use CSS selectors instead."
155 .to_string(),
156 ));
157 }
158 crate::domain::Selector::Css(css) | crate::domain::Selector::Both { css, .. } => css,
159 };
160
161 let selector = ScraperSelector::parse(selector_text).map_err(|e| {
163 crate::error::PluginError::SelectorError {
164 selector: selector_text.clone(),
165 reason: format!("Failed to parse CSS selector: {e:?}"),
166 }
167 })?;
168
169 let mut results = Vec::new();
170
171 for element in document.select(&selector) {
173 let text = element.inner_html();
174
175 let transformed =
177 crate::domain::Transformation::apply_chain(®ion.transformations, text)?;
178
179 results.push(transformed);
180 }
181
182 if results.is_empty() {
183 return Err(crate::error::PluginError::ExtractionError(format!(
184 "No elements matched CSS selector: {selector_text}"
185 )));
186 }
187
188 Ok(results)
189}
190
191fn diagnose_region(
192 document: &Html,
193 region: &crate::domain::Region,
194 evaluation_scope: &str,
195) -> RegionDebugInfo {
196 let (selector_kind, selector_text) = match ®ion.selector {
197 crate::domain::Selector::Css(css) => ("css", css.as_str()),
198 crate::domain::Selector::XPath(xpath) => ("xpath", xpath.as_str()),
199 crate::domain::Selector::Both { css, .. } => ("dual", css.as_str()),
200 };
201
202 if matches!(®ion.selector, crate::domain::Selector::XPath(_)) {
203 return RegionDebugInfo {
204 selector: selector_text.to_string(),
205 selector_kind: selector_kind.to_string(),
206 evaluation_scope: evaluation_scope.to_string(),
207 match_count: 0,
208 raw_match_html: None,
209 raw_extracted_value: None,
210 transformation_output_chain: Vec::new(),
211 final_value: None,
212 error: Some(
213 "XPath selectors are not yet supported. Please use CSS selectors instead."
214 .to_string(),
215 ),
216 };
217 }
218
219 let selector = match ScraperSelector::parse(selector_text) {
220 Ok(selector) => selector,
221 Err(error) => {
222 return RegionDebugInfo {
223 selector: selector_text.to_string(),
224 selector_kind: selector_kind.to_string(),
225 evaluation_scope: evaluation_scope.to_string(),
226 match_count: 0,
227 raw_match_html: None,
228 raw_extracted_value: None,
229 transformation_output_chain: Vec::new(),
230 final_value: None,
231 error: Some(format!("Failed to parse CSS selector: {error:?}")),
232 };
233 }
234 };
235
236 let elements: Vec<_> = document.select(&selector).collect();
237 let match_count = elements.len();
238
239 let Some(first_match) = elements.first() else {
240 return RegionDebugInfo {
241 selector: selector_text.to_string(),
242 selector_kind: selector_kind.to_string(),
243 evaluation_scope: evaluation_scope.to_string(),
244 match_count,
245 raw_match_html: None,
246 raw_extracted_value: None,
247 transformation_output_chain: Vec::new(),
248 final_value: None,
249 error: Some(format!("No elements matched CSS selector: {selector_text}")),
250 };
251 };
252
253 let raw_match_html = truncate_debug(&first_match.html(), 800);
254 let raw_extracted_value = first_match.inner_html();
255 let (transformation_output_chain, final_value, error) =
256 trace_transformations(®ion.transformations, &raw_extracted_value);
257
258 RegionDebugInfo {
259 selector: selector_text.to_string(),
260 selector_kind: selector_kind.to_string(),
261 evaluation_scope: evaluation_scope.to_string(),
262 match_count,
263 raw_match_html: Some(raw_match_html),
264 raw_extracted_value: Some(truncate_debug(&raw_extracted_value, 800)),
265 transformation_output_chain,
266 final_value: final_value.map(|value| truncate_debug(&value, 800)),
267 error,
268 }
269}
270
271fn trace_transformations(
272 transformations: &[crate::domain::Transformation],
273 raw_value: &str,
274) -> (Vec<TransformationDebugStep>, Option<String>, Option<String>) {
275 let mut current = raw_value.to_string();
276 let mut steps = Vec::with_capacity(transformations.len());
277
278 for transformation in transformations {
279 let input = current.clone();
280 match transformation.apply(¤t) {
281 Ok(output) => {
282 steps.push(TransformationDebugStep {
283 transformation: format!("{transformation:?}"),
284 input: truncate_debug(&input, 400),
285 output: Some(truncate_debug(&output, 400)),
286 error: None,
287 });
288 current = output;
289 }
290 Err(error) => {
291 let error_text = error.to_string();
292 steps.push(TransformationDebugStep {
293 transformation: format!("{transformation:?}"),
294 input: truncate_debug(&input, 400),
295 output: None,
296 error: Some(error_text.clone()),
297 });
298 return (steps, None, Some(error_text));
299 }
300 }
301 }
302
303 (steps, Some(current), None)
304}
305
/// Returns `value` limited to its first `max_chars` characters.
///
/// Length is measured in `char`s, not bytes, so multi-byte characters are never
/// split. When anything is cut off, `"..."` is appended to the kept prefix; a
/// value that already fits is returned unchanged.
fn truncate_debug(value: &str, max_chars: usize) -> String {
    // The byte offset of the character at index `max_chars` is where the kept
    // prefix ends; if no such character exists the whole value fits.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => {
            let mut truncated = String::with_capacity(cut + 3);
            // `cut` comes from `char_indices`, so it always lies on a char boundary.
            truncated.push_str(&value[..cut]);
            truncated.push_str("...");
            truncated
        }
        None => value.to_string(),
    }
}
319
320#[async_trait]
321impl PluginExtractionPort for ExtractionEngine {
322 async fn execute(&self, request: &ExtractionRequest) -> Result<ExtractionResult> {
323 Self::execute(request)
324 }
325
326 async fn validate_selector(&self, html: &str, selector_expr: &str) -> Result<(bool, usize)> {
327 let document = Html::parse_document(html);
328
329 ScraperSelector::parse(selector_expr).map_or(Ok((false, 0)), |selector| {
330 let count = document.select(&selector).count();
331 Ok((true, count))
332 })
333 }
334}
335
#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::{ExtractionTemplate, Region, Selector, Transformation};
    use serde_json::{Value, json};

    // A selector with exactly one match yields a plain JSON string.
    #[test]
    fn test_extract_single_element() -> crate::Result<()> {
        let html = r#"<div><p class="title">Hello World</p></div>"#;

        let region = Region::new("title", Selector::css(".title"), json!({"type": "string"}));
        let template = ExtractionTemplate::new("Test").with_region(region);

        let request = ExtractionRequest::new(template, "http://example.com", html);
        let result = ExtractionEngine::execute(&request)?;

        assert!(result.is_fully_successful());
        assert_eq!(result.data.get("title"), Some(&json!("Hello World")));
        Ok(())
    }

    // A selector with several matches yields a JSON array with one entry per match.
    #[test]
    fn test_extract_multiple_elements() -> crate::Result<()> {
        let html = r#"
            <div>
                <p class="item">Item 1</p>
                <p class="item">Item 2</p>
                <p class="item">Item 3</p>
            </div>
        "#;

        let region = Region::new("items", Selector::css(".item"), json!({"type": "array"}));
        let template = ExtractionTemplate::new("Test").with_region(region);

        let request = ExtractionRequest::new(template, "http://example.com", html);
        let result = ExtractionEngine::execute(&request)?;

        let items_len = result
            .data
            .get("items")
            .and_then(Value::as_array)
            .map(Vec::len);
        assert_eq!(items_len, Some(3));
        Ok(())
    }

    // Transformations attached to a region are applied to the extracted value.
    #[test]
    fn test_extract_with_transformation() -> crate::Result<()> {
        let html = r#"<div><p class="price"> $19.99 </p></div>"#;

        let region = Region::new("price", Selector::css(".price"), json!({"type": "string"}))
            .with_transformation(Transformation::Trim);
        let template = ExtractionTemplate::new("Test").with_region(region);

        let request = ExtractionRequest::new(template, "http://example.com", html);
        let result = ExtractionEngine::execute(&request)?;

        assert_eq!(result.data.get("price"), Some(&json!("$19.99")));
        Ok(())
    }

    // A parseable selector is reported as valid even when it matches nothing.
    #[tokio::test]
    async fn test_selector_validation() -> crate::Result<()> {
        let html = r#"<div><p class="test">Content</p></div>"#;
        let engine = ExtractionEngine;

        let (valid, count) = engine.validate_selector(html, ".test").await?;
        assert!(valid);
        assert_eq!(count, 1);

        let (valid, count) = engine.validate_selector(html, ".nonexistent").await?;
        assert!(valid);
        assert_eq!(count, 0);
        Ok(())
    }

    // An unparseable selector is reported as invalid instead of returning an error.
    #[tokio::test]
    async fn test_invalid_css_selector() -> crate::Result<()> {
        let html = "<div><p>Content</p></div>";
        let engine = ExtractionEngine;

        let (valid, _) = engine.validate_selector(html, ">>>invalid").await?;
        assert!(!valid);
        Ok(())
    }

    // Selector lists, `:nth-child`, attribute matching and descendant combinators
    // are all accepted and counted.
    #[tokio::test]
    async fn test_supported_css_selector_features() -> crate::Result<()> {
        let html = r#"
            <table>
                <tr data-testid="person-row">
                    <td>skip</td>
                    <td><span class="name">Ada Lovelace</span></td>
                    <td data-testid="name-cell"><span class="title">Founder</span></td>
                </tr>
            </table>
        "#;
        let engine = ExtractionEngine;

        let (valid, count) = engine
            .validate_selector(
                html,
                "td:nth-child(2), [data-testid*='name'] .title, tr[data-testid='person-row'] .name",
            )
            .await?;

        assert!(valid);
        assert_eq!(count, 3);
        Ok(())
    }

    // Diagnostics expose the raw match, every transformation step and the final value.
    #[test]
    fn test_diagnostics_capture_match_and_transformations() {
        let html = r#"<div><span class="name"> Ada Lovelace </span></div>"#;
        let region = Region::new(
            "full_name",
            Selector::css(".name"),
            json!({"type": "string"}),
        )
        .with_transformation(Transformation::Trim)
        .with_transformation(Transformation::Uppercase);
        let template = ExtractionTemplate::new("Debug Test").with_region(region);
        let request = ExtractionRequest::new(template, "http://example.com", html);

        let diagnostics = ExtractionEngine::diagnose(&request, "document");
        let region = diagnostics.regions.get("full_name");

        assert!(region.is_some());
        assert_eq!(region.map(|value| value.match_count), Some(1));
        assert_eq!(
            region.and_then(|value| value.final_value.as_deref()),
            Some("ADA LOVELACE"),
        );
        assert_eq!(
            region.map(|value| value.transformation_output_chain.len()),
            Some(2),
        );
        assert!(
            region
                .and_then(|value| value.raw_match_html.as_deref())
                .is_some_and(|value| value.contains("Ada Lovelace"))
        );
    }
}