1use scraper::{Html, Selector as ScraperSelector};
6use serde_json::{Value, json};
7use std::sync::Arc;
8use uuid::Uuid;
9
10use crate::{
11 ExtractionRequest, PluginError, Result,
12 adapters::ExtractionEngine,
13 domain::{ExtractionTemplate, Region, Selector, Transformation},
14 ports::{IdempotencyKeyStore, PluginExtractionPort, PluginTemplateStore},
15 storage::{FileTemplateStore, MemoryIdempotencyStore},
16};
17
/// Human-readable catalogue of the transformation specs understood by
/// `parse_transformation`; it is appended to the validation error returned
/// when a transformation string cannot be parsed.
const SUPPORTED_TRANSFORMATIONS: &str = concat!(
    "Trim, ",
    "Lowercase, ",
    "Uppercase, ",
    "RemoveWhitespace, ",
    "NormalizeWhitespace, ",
    "StripHtml, ",
    "DecodeHtml, ",
    "ParseJson, ",
    "Regex:pattern/replacement, ",
    "RegexExtract:pattern/group, ",
    "Coerce:type, ",
    "Filter:pattern",
);
19
/// Server object that exposes template management and extraction as MCP tools.
///
/// It owns shared handles to three pluggable backends; the tool handlers in
/// the `impl` block below read templates from `template_store` and run
/// extractions through `extraction_engine`.
// NOTE(review): `idempotency_store` is assigned by both constructors but never
// read in this file — presumably that is what the `dead_code` allowance is for.
#[allow(dead_code)]
pub struct McpPluginServer {
    /// Persistence backend for extraction templates (get / save / list / delete).
    template_store: Arc<dyn PluginTemplateStore>,
    /// Engine that executes extraction requests and validates CSS selectors.
    extraction_engine: Arc<dyn PluginExtractionPort>,
    /// Store for idempotency keys; not consulted by any handler in this file.
    idempotency_store: Arc<dyn IdempotencyKeyStore>,
}
27
28impl McpPluginServer {
29 pub fn new_with_file_storage(templates_dir: std::path::PathBuf) -> Self {
31 Self {
32 template_store: Arc::new(FileTemplateStore::new(templates_dir)),
33 extraction_engine: Arc::new(ExtractionEngine),
34 idempotency_store: Arc::new(MemoryIdempotencyStore::new()),
35 }
36 }
37
38 pub fn with_adapters(
40 template_store: Arc<dyn PluginTemplateStore>,
41 extraction_engine: Arc<dyn PluginExtractionPort>,
42 idempotency_store: Arc<dyn IdempotencyKeyStore>,
43 ) -> Self {
44 Self {
45 template_store,
46 extraction_engine,
47 idempotency_store,
48 }
49 }
50
51 fn tools_template_management() -> [Value; 3] {
52 [
53 json!({
54 "name": "plugin_create_template",
55 "description": "Create a new extraction template with the given name and optional description. Returns the template UUID.",
56 "inputSchema": {
57 "type": "object",
58 "properties": {
59 "name": { "type": "string", "description": "Template name (e.g., 'Product Listings')" },
60 "description": { "type": "string", "description": "Optional template description" },
61 "tags": {
62 "type": "array",
63 "items": { "type": "string" },
64 "description": "Optional tags for organization"
65 }
66 },
67 "required": ["name"]
68 }
69 }),
70 json!({
71 "name": "plugin_list_templates",
72 "description": "List all saved extraction templates with metadata.",
73 "inputSchema": { "type": "object", "properties": {} }
74 }),
75 json!({
76 "name": "plugin_delete_template",
77 "description": "Delete an extraction template permanently.",
78 "inputSchema": {
79 "type": "object",
80 "properties": {
81 "template_id": { "type": "string", "description": "UUID of the template to delete" }
82 },
83 "required": ["template_id"]
84 }
85 }),
86 ]
87 }
88
89 fn tools_extraction() -> [Value; 4] {
90 [
91 json!({
92 "name": "plugin_add_region",
93 "description": "Add an extraction region (named zone) to a template. A region is a named selector with transformations.",
94 "inputSchema": {
95 "type": "object",
96 "properties": {
97 "template_id": { "type": "string", "description": "UUID of the template" },
98 "region_name": { "type": "string", "description": "Unique name for this region (e.g., 'product_title')" },
99 "selector_css": { "type": "string", "description": "Optional CSS selector" },
100 "selector_xpath": { "type": "string", "description": "Optional XPath selector" },
101 "transformations": {
102 "type": "array",
103 "items": { "type": "string" },
104 "description": "Ordered transformations: 'Trim', 'Lowercase', 'Regex:pattern/replace', 'StripHtml', etc."
105 }
106 },
107 "required": ["template_id", "region_name"]
108 }
109 }),
110 json!({
111 "name": "plugin_apply_template",
112 "description": "Apply an extraction template to HTML content. Returns extracted data for each region.",
113 "inputSchema": {
114 "type": "object",
115 "properties": {
116 "template_id": { "type": "string", "description": "UUID of the template to apply" },
117 "html": { "type": "string", "description": "HTML content to extract from" },
118 "url": { "type": "string", "description": "Source URL (for logging/context)" },
119 "debug": { "type": "boolean", "description": "Include per-region selector diagnostics and root HTML snippet." }
120 },
121 "required": ["template_id", "html", "url"]
122 }
123 }),
124 json!({
125 "name": "plugin_get_template",
126 "description": "Retrieve a template's full configuration.",
127 "inputSchema": {
128 "type": "object",
129 "properties": {
130 "template_id": { "type": "string", "description": "UUID of the template" }
131 },
132 "required": ["template_id"]
133 }
134 }),
135 json!({
136 "name": "plugin_extract_batch",
137 "description": "Apply a template to extract multiple instances from a page (e.g., all products).",
138 "inputSchema": {
139 "type": "object",
140 "properties": {
141 "template_id": { "type": "string", "description": "UUID of the template" },
142 "html": { "type": "string", "description": "HTML content" },
143 "url": { "type": "string", "description": "Source URL" },
144 "root_selector": { "type": "string", "description": "CSS selector for parent containers to iterate over" }
145 },
146 "required": ["template_id", "html", "url", "root_selector"]
147 }
148 }),
149 ]
150 }
151
152 fn tools_inspection() -> [Value; 1] {
153 [json!({
154 "name": "plugin_inspect_selector",
155 "description": "Test if a CSS/XPath selector matches elements in HTML. Returns match count and preview.",
156 "inputSchema": {
157 "type": "object",
158 "properties": {
159 "html": { "type": "string", "description": "HTML to test against" },
160 "selector_css": { "type": "string", "description": "CSS selector to test" },
161 "selector_xpath": { "type": "string", "description": "XPath to test as fallback" }
162 },
163 "required": ["html"]
164 }
165 })]
166 }
167
168 pub fn tools_list(&self) -> Vec<Value> {
170 let mut tools = Vec::with_capacity(8);
171 tools.extend(Self::tools_template_management());
172 tools.extend(Self::tools_extraction());
173 tools.extend(Self::tools_inspection());
174 tools
175 }
176
177 pub async fn handle_tool_call(&self, name: &str, args: &Value) -> Value {
179 let result = match name {
180 "plugin_create_template" => self.tool_create_template(args).await,
181 "plugin_add_region" => self.tool_add_region(args).await,
182 "plugin_apply_template" => self.tool_apply_template(args).await,
183 "plugin_list_templates" => self.tool_list_templates(args).await,
184 "plugin_delete_template" => self.tool_delete_template(args).await,
185 "plugin_get_template" => self.tool_get_template(args).await,
186 "plugin_extract_batch" => self.tool_extract_batch(args).await,
187 "plugin_inspect_selector" => self.tool_inspect_selector(args).await,
188 _ => Err(PluginError::TemplateValidationError(format!(
189 "unknown tool: {name}"
190 ))),
191 };
192
193 match result {
194 Ok(data) => {
195 json!({ "content": [{ "type": "text", "text": serde_json::to_string(&data).unwrap_or_default() }] })
196 }
197 Err(e) => {
198 json!({ "content": [{ "type": "text", "text": format!("Error: {}", e) }], "isError": true })
199 }
200 }
201 }
202
203 async fn tool_create_template(&self, args: &Value) -> Result<Value> {
206 let name = args
207 .get("name")
208 .and_then(Value::as_str)
209 .ok_or_else(|| PluginError::TemplateValidationError("missing 'name'".to_string()))?;
210
211 let description = args
212 .get("description")
213 .and_then(Value::as_str)
214 .map(ToString::to_string);
215
216 let tags = args
217 .get("tags")
218 .and_then(Value::as_array)
219 .map(|a| {
220 a.iter()
221 .filter_map(|v| v.as_str().map(ToString::to_string))
222 .collect()
223 })
224 .unwrap_or_default();
225
226 let mut template = ExtractionTemplate::new(name);
227 if let Some(desc) = description {
228 template = template.with_description(desc);
229 }
230 template = template.with_tags(tags);
231
232 self.template_store.save(&template).await?;
233
234 Ok(json!({
235 "template_id": template.id.to_string(),
236 "name": template.name,
237 "created_at": template.metadata.created_at.to_rfc3339(),
238 }))
239 }
240
241 async fn tool_add_region(&self, args: &Value) -> Result<Value> {
242 let template_id = args
243 .get("template_id")
244 .and_then(Value::as_str)
245 .and_then(|s| Uuid::parse_str(s).ok())
246 .ok_or_else(|| {
247 PluginError::TemplateValidationError("invalid 'template_id'".to_string())
248 })?;
249
250 let region_name = args
251 .get("region_name")
252 .and_then(Value::as_str)
253 .map(ToString::to_string)
254 .ok_or_else(|| {
255 PluginError::TemplateValidationError("missing 'region_name'".to_string())
256 })?;
257
258 let selector_css = args
259 .get("selector_css")
260 .and_then(Value::as_str)
261 .map(ToString::to_string);
262 let selector_xpath = args
263 .get("selector_xpath")
264 .and_then(Value::as_str)
265 .map(ToString::to_string);
266
267 let selector = match (selector_css, selector_xpath) {
268 (Some(css), Some(xpath)) => Selector::dual(css, xpath),
269 (Some(css), None) => Selector::css(css),
270 (None, Some(xpath)) => Selector::xpath(xpath),
271 (None, None) => {
272 return Err(PluginError::TemplateValidationError(
273 "must provide either selector_css or selector_xpath".to_string(),
274 ));
275 }
276 };
277
278 let mut template = self.template_store.get(&template_id).await?;
280
281 let mut transformations = Vec::new();
283 if let Some(arr) = args.get("transformations").and_then(Value::as_array) {
284 for (idx, v) in arr.iter().enumerate() {
285 let s = v.as_str().ok_or_else(|| {
286 PluginError::TemplateValidationError(format!(
287 "transformation at index {idx} must be a string"
288 ))
289 })?;
290 let transformation = parse_transformation(s).map_err(|_| {
291 PluginError::TemplateValidationError(format!(
292 "invalid transformation at index {idx}: '{s}'. Supported transformations: {SUPPORTED_TRANSFORMATIONS}"
293 ))
294 })?;
295 transformations.push(transformation);
296 }
297 }
298
299 let mut region = Region::new(®ion_name, selector, json!({"type": "string"}));
301 for t in transformations {
302 region = region.with_transformation(t);
303 }
304
305 template = template.with_region(region);
306 self.template_store.save(&template).await?;
307
308 Ok(json!({
309 "template_id": template.id.to_string(),
310 "region_name": region_name,
311 "regions_count": template.regions.len(),
312 }))
313 }
314
315 async fn tool_apply_template(&self, args: &Value) -> Result<Value> {
316 let template_id = args
317 .get("template_id")
318 .and_then(Value::as_str)
319 .and_then(|s| Uuid::parse_str(s).ok())
320 .ok_or_else(|| {
321 PluginError::TemplateValidationError("invalid 'template_id'".to_string())
322 })?;
323
324 let html = args
325 .get("html")
326 .and_then(Value::as_str)
327 .map(ToString::to_string)
328 .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
329
330 let url = args
331 .get("url")
332 .and_then(Value::as_str)
333 .map(ToString::to_string)
334 .ok_or_else(|| PluginError::TemplateValidationError("missing 'url'".to_string()))?;
335 let debug = args.get("debug").and_then(Value::as_bool).unwrap_or(false);
336
337 let template = self.template_store.get(&template_id).await?;
338 let request = ExtractionRequest::new(template, &url, &html);
339 let result = self.extraction_engine.execute(&request).await?;
340 let debug_payload = debug.then(|| ExtractionEngine::diagnose(&request, "document"));
341
342 Ok(json!({
343 "data": result.data,
344 "metadata": {
345 "regions_successful": result.metadata.region_status.values().filter(|s| s.success).count(),
346 "total_regions": result.metadata.region_status.len(),
347 "elapsed_ms": result.metadata.elapsed_ms,
348 "region_status": result.metadata.region_status,
349 "errors": result.metadata.errors,
350 },
351 "debug": debug_payload,
352 }))
353 }
354
355 async fn tool_list_templates(&self, _args: &Value) -> Result<Value> {
356 let templates = self.template_store.list().await?;
357 let list: Vec<_> = templates
358 .iter()
359 .map(|t| {
360 json!({
361 "id": t.id.to_string(),
362 "name": &t.name,
363 "description": &t.description,
364 "regions": t.regions.len(),
365 "created_at": t.metadata.created_at.to_rfc3339(),
366 "usage_count": t.metadata.usage_count,
367 "tags": &t.metadata.tags,
368 })
369 })
370 .collect();
371
372 Ok(json!({
373 "count": list.len(),
374 "templates": list,
375 }))
376 }
377
378 async fn tool_delete_template(&self, args: &Value) -> Result<Value> {
379 let template_id = args
380 .get("template_id")
381 .and_then(Value::as_str)
382 .and_then(|s| Uuid::parse_str(s).ok())
383 .ok_or_else(|| {
384 PluginError::TemplateValidationError("invalid 'template_id'".to_string())
385 })?;
386
387 self.template_store.delete(&template_id).await?;
388
389 Ok(json!({
390 "deleted": template_id.to_string(),
391 }))
392 }
393
394 async fn tool_get_template(&self, args: &Value) -> Result<Value> {
395 let template_id = args
396 .get("template_id")
397 .and_then(Value::as_str)
398 .and_then(|s| Uuid::parse_str(s).ok())
399 .ok_or_else(|| {
400 PluginError::TemplateValidationError("invalid 'template_id'".to_string())
401 })?;
402
403 let template = self.template_store.get(&template_id).await?;
404
405 Ok(json!({
406 "id": template.id.to_string(),
407 "name": template.name,
408 "description": template.description,
409 "regions": template.regions.iter().map(|r| {
410 json!({
411 "name": r.name,
412 "selector": format!("{:?}", r.selector),
413 "transformations": r.transformations.iter().map(|t| format!("{t:?}")).collect::<Vec<_>>(),
414 })
415 }).collect::<Vec<_>>(),
416 "metadata": {
417 "created_at": template.metadata.created_at.to_rfc3339(),
418 "updated_at": template.metadata.updated_at.to_rfc3339(),
419 "usage_count": template.metadata.usage_count,
420 }
421 }))
422 }
423
424 async fn tool_extract_batch(&self, args: &Value) -> Result<Value> {
425 let template_id = args
426 .get("template_id")
427 .and_then(Value::as_str)
428 .and_then(|s| Uuid::parse_str(s).ok())
429 .ok_or_else(|| {
430 PluginError::TemplateValidationError("invalid 'template_id'".to_string())
431 })?;
432
433 let html = args
434 .get("html")
435 .and_then(Value::as_str)
436 .map(ToString::to_string)
437 .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
438
439 let url = args
440 .get("url")
441 .and_then(Value::as_str)
442 .map(ToString::to_string)
443 .ok_or_else(|| PluginError::TemplateValidationError("missing 'url'".to_string()))?;
444
445 let root_selector_str = args
446 .get("root_selector")
447 .and_then(Value::as_str)
448 .map(ToString::to_string)
449 .ok_or_else(|| {
450 PluginError::TemplateValidationError("missing 'root_selector'".to_string())
451 })?;
452 let debug = args.get("debug").and_then(Value::as_bool).unwrap_or(false);
453
454 let root_selector =
456 ScraperSelector::parse(&root_selector_str).map_err(|_| PluginError::SelectorError {
457 selector: root_selector_str.clone(),
458 reason: "Failed to parse root_selector as CSS selector".to_string(),
459 })?;
460
461 let root_elements: Vec<String> = {
464 let document = Html::parse_document(&html);
465 document
466 .select(&root_selector)
467 .map(|elem| elem.html())
468 .collect()
469 };
470
471 if root_elements.is_empty() {
472 return Err(PluginError::ExtractionError(format!(
473 "root_selector matched no elements: {root_selector_str}"
474 )));
475 }
476
477 let first_root_html = debug.then(|| {
478 root_elements.first().map(|root| {
479 let mut truncated = String::new();
480 for (index, ch) in root.chars().enumerate() {
481 if index >= 2_000 {
482 truncated.push_str("...");
483 break;
484 }
485 truncated.push(ch);
486 }
487 truncated
488 })
489 });
490
491 let template = self.template_store.get(&template_id).await?;
493 let mut results = Vec::new();
494
495 for root_html in root_elements {
496 let request = ExtractionRequest::new(template.clone(), &url, &root_html);
497 match self.extraction_engine.execute(&request).await {
498 Ok(result) => {
499 results.push(json!({
500 "data": result.data,
501 "successful_regions": result.metadata.region_status.values().filter(|s| s.success).count(),
502 }));
503 }
504 Err(e) => {
505 results.push(json!({
507 "error": e.to_string(),
508 "successful_regions": 0,
509 }));
510 }
511 }
512 }
513
514 Ok(json!({
515 "root_selector": root_selector_str,
516 "results": results,
517 "total_matched": results.len(),
518 "successful": results.iter().filter(|r| r.get("data").is_some()).count(),
519 "debug": debug.then(|| json!({
520 "evaluation_scope": "root_fragment",
521 "first_root_html": first_root_html,
522 })),
523 }))
524 }
525
526 async fn tool_inspect_selector(&self, args: &Value) -> Result<Value> {
527 let html = args
528 .get("html")
529 .and_then(Value::as_str)
530 .map(ToString::to_string)
531 .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
532
533 let selector_css = args
534 .get("selector_css")
535 .and_then(Value::as_str)
536 .map(ToString::to_string);
537 let selector_xpath = args
538 .get("selector_xpath")
539 .and_then(Value::as_str)
540 .map(ToString::to_string);
541
542 let selector = match (&selector_css, &selector_xpath) {
543 (Some(css), Some(xpath)) => Selector::dual(css, xpath),
544 (Some(css), None) => Selector::css(css),
545 (None, Some(xpath)) => Selector::xpath(xpath),
546 (None, None) => {
547 return Err(PluginError::TemplateValidationError(
548 "must provide either selector_css or selector_xpath".to_string(),
549 ));
550 }
551 };
552
553 selector.validate()?;
554
555 if let Some(css) = selector_css {
557 let (is_valid, count) = self
558 .extraction_engine
559 .validate_selector(&html, &css)
560 .await?;
561 Ok(json!({
562 "selector": css,
563 "selector_type": "css",
564 "valid": is_valid,
565 "match_count": count,
566 "preview": if count > 0 { "Selector matched elements" } else { "No elements matched" }
567 }))
568 } else if selector_xpath.is_some() {
569 Ok(json!({
571 "selector": selector_xpath,
572 "selector_type": "xpath",
573 "valid": true,
574 "note": "XPath selectors are not yet supported for validation. Please use CSS selectors to test matches."
575 }))
576 } else {
577 Err(PluginError::TemplateValidationError(
578 "No selector provided".to_string(),
579 ))
580 }
581 }
582}
583
584pub(crate) fn parse_transformation(s: &str) -> Result<Transformation> {
587 match s {
588 "Trim" => Ok(Transformation::Trim),
589 "Lowercase" => Ok(Transformation::Lowercase),
590 "Uppercase" => Ok(Transformation::Uppercase),
591 "RemoveWhitespace" => Ok(Transformation::RemoveWhitespace),
592 "NormalizeWhitespace" => Ok(Transformation::NormalizeWhitespace),
593 "StripHtml" => Ok(Transformation::StripHtml),
594 "DecodeHtml" => Ok(Transformation::DecodeHtml),
595 "ParseJson" => Ok(Transformation::ParseJson),
596 s if s.starts_with("RegexExtract:") => s
597 .strip_prefix("RegexExtract:")
598 .and_then(|rest| rest.rsplit_once('/'))
599 .map_or_else(
600 || {
601 Err(PluginError::TemplateValidationError(
602 "RegexExtract format: RegexExtract:pattern/group".to_string(),
603 ))
604 },
605 |(pattern, group_str)| {
606 let group = group_str.parse::<usize>().map_err(|_| {
607 PluginError::TemplateValidationError(
608 "RegexExtract group must be a positive integer".to_string(),
609 )
610 })?;
611 Ok(Transformation::RegexExtract {
612 pattern: pattern.to_string(),
613 group,
614 })
615 },
616 ),
617 s if s.starts_with("Coerce:") => s.strip_prefix("Coerce:").map_or_else(
618 || {
619 Err(PluginError::TemplateValidationError(
620 "Coerce format: Coerce:type".to_string(),
621 ))
622 },
623 |target_type| {
624 Ok(Transformation::Coerce {
625 target_type: target_type.to_string(),
626 })
627 },
628 ),
629 s if s.starts_with("Filter:") => s.strip_prefix("Filter:").map_or_else(
630 || {
631 Err(PluginError::TemplateValidationError(
632 "Filter format: Filter:pattern".to_string(),
633 ))
634 },
635 |pattern| {
636 Ok(Transformation::Filter {
637 pattern: pattern.to_string(),
638 })
639 },
640 ),
641 s if s.starts_with("Regex:") => s
642 .strip_prefix("Regex:")
643 .and_then(|rest| rest.split_once('/'))
644 .map_or_else(
645 || {
646 Err(PluginError::TemplateValidationError(
647 "Regex format: Regex:pattern/replacement".to_string(),
648 ))
649 },
650 |(pattern, replacement)| {
651 Ok(Transformation::Regex {
652 pattern: pattern.to_string(),
653 replacement: replacement.to_string(),
654 })
655 },
656 ),
657 _ => Err(PluginError::TemplateValidationError(format!(
658 "unknown transformation: {s}"
659 ))),
660 }
661}
662
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: a representative sample of spec forms parses, and an
    /// unrecognised name is rejected.
    #[test]
    fn test_parse_transformation() {
        let accepted = [
            "Trim",
            "Lowercase",
            "Regex:pattern/replace",
            "RegexExtract:price:(\\d+\\.\\d+)/1",
            "Coerce:number",
            "Filter:^ok$",
        ];
        for spec in accepted {
            assert!(parse_transformation(spec).is_ok());
        }

        assert!(parse_transformation("Invalid").is_err());
    }
}
677}