1use scraper::{Html, Selector as ScraperSelector};
6use serde_json::{Value, json};
7use std::sync::Arc;
8use uuid::Uuid;
9
10use crate::{
11 ExtractionRequest, PluginError, Result,
12 adapters::ExtractionEngine,
13 domain::{ExtractionTemplate, Region, Selector, Transformation},
14 ports::{IdempotencyKeyStore, PluginExtractionPort, PluginTemplateStore},
15 storage::{FileTemplateStore, MemoryIdempotencyStore},
16};
17
/// Human-readable list of the transformation spellings a caller may pass.
///
/// Embedded verbatim in the `plugin_add_region` validation error so callers can see the
/// accepted forms. Keep it in sync with the spellings handled by `parse_transformation`.
const SUPPORTED_TRANSFORMATIONS: &str = "Trim, Lowercase, Uppercase, RemoveWhitespace, NormalizeWhitespace, StripHtml, DecodeHtml, ParseJson, Regex:pattern/replacement, RegexExtract:pattern/group, Coerce:type, Filter:pattern";
19
/// Server object that exposes the `plugin_*` extraction-template tools on top of injected
/// storage and extraction adapters.
// NOTE(review): `dead_code` is presumably allowed because `idempotency_store` is assigned by
// the constructors but never read by any handler visible in this file — confirm before removing.
#[allow(dead_code)]
pub struct McpPluginServer {
    /// Persistence for extraction templates (save / get / list / delete).
    template_store: Arc<dyn PluginTemplateStore>,
    /// Engine used to execute extraction requests and to test CSS selectors.
    extraction_engine: Arc<dyn PluginExtractionPort>,
    /// Idempotency-key storage; held by the server but not consulted by the visible handlers.
    idempotency_store: Arc<dyn IdempotencyKeyStore>,
}
27
28impl McpPluginServer {
29 #[must_use]
31 pub fn new_with_file_storage(templates_dir: std::path::PathBuf) -> Self {
32 Self {
33 template_store: Arc::new(FileTemplateStore::new(templates_dir)),
34 extraction_engine: Arc::new(ExtractionEngine),
35 idempotency_store: Arc::new(MemoryIdempotencyStore::new()),
36 }
37 }
38
39 pub fn with_adapters(
41 template_store: Arc<dyn PluginTemplateStore>,
42 extraction_engine: Arc<dyn PluginExtractionPort>,
43 idempotency_store: Arc<dyn IdempotencyKeyStore>,
44 ) -> Self {
45 Self {
46 template_store,
47 extraction_engine,
48 idempotency_store,
49 }
50 }
51
52 fn tools_template_management() -> [Value; 3] {
53 [
54 json!({
55 "name": "plugin_create_template",
56 "description": "Create a new extraction template with the given name and optional description. Returns the template UUID.",
57 "inputSchema": {
58 "type": "object",
59 "properties": {
60 "name": { "type": "string", "description": "Template name (e.g., 'Product Listings')" },
61 "description": { "type": "string", "description": "Optional template description" },
62 "tags": {
63 "type": "array",
64 "items": { "type": "string" },
65 "description": "Optional tags for organization"
66 }
67 },
68 "required": ["name"]
69 }
70 }),
71 json!({
72 "name": "plugin_list_templates",
73 "description": "List all saved extraction templates with metadata.",
74 "inputSchema": { "type": "object", "properties": {} }
75 }),
76 json!({
77 "name": "plugin_delete_template",
78 "description": "Delete an extraction template permanently.",
79 "inputSchema": {
80 "type": "object",
81 "properties": {
82 "template_id": { "type": "string", "description": "UUID of the template to delete" }
83 },
84 "required": ["template_id"]
85 }
86 }),
87 ]
88 }
89
90 fn tools_extraction() -> [Value; 4] {
91 [
92 json!({
93 "name": "plugin_add_region",
94 "description": "Add an extraction region (named zone) to a template. A region is a named selector with transformations.",
95 "inputSchema": {
96 "type": "object",
97 "properties": {
98 "template_id": { "type": "string", "description": "UUID of the template" },
99 "region_name": { "type": "string", "description": "Unique name for this region (e.g., 'product_title')" },
100 "selector_css": { "type": "string", "description": "Optional CSS selector" },
101 "selector_xpath": { "type": "string", "description": "Optional XPath selector" },
102 "transformations": {
103 "type": "array",
104 "items": { "type": "string" },
105 "description": "Ordered transformations: 'Trim', 'Lowercase', 'Regex:pattern/replace', 'StripHtml', etc."
106 }
107 },
108 "required": ["template_id", "region_name"]
109 }
110 }),
111 json!({
112 "name": "plugin_apply_template",
113 "description": "Apply an extraction template to HTML content. Returns extracted data for each region.",
114 "inputSchema": {
115 "type": "object",
116 "properties": {
117 "template_id": { "type": "string", "description": "UUID of the template to apply" },
118 "html": { "type": "string", "description": "HTML content to extract from" },
119 "url": { "type": "string", "description": "Source URL (for logging/context)" },
120 "debug": { "type": "boolean", "description": "Include per-region selector diagnostics and root HTML snippet." }
121 },
122 "required": ["template_id", "html", "url"]
123 }
124 }),
125 json!({
126 "name": "plugin_get_template",
127 "description": "Retrieve a template's full configuration.",
128 "inputSchema": {
129 "type": "object",
130 "properties": {
131 "template_id": { "type": "string", "description": "UUID of the template" }
132 },
133 "required": ["template_id"]
134 }
135 }),
136 json!({
137 "name": "plugin_extract_batch",
138 "description": "Apply a template to extract multiple instances from a page (e.g., all products).",
139 "inputSchema": {
140 "type": "object",
141 "properties": {
142 "template_id": { "type": "string", "description": "UUID of the template" },
143 "html": { "type": "string", "description": "HTML content" },
144 "url": { "type": "string", "description": "Source URL" },
145 "root_selector": { "type": "string", "description": "CSS selector for parent containers to iterate over" }
146 },
147 "required": ["template_id", "html", "url", "root_selector"]
148 }
149 }),
150 ]
151 }
152
153 fn tools_inspection() -> [Value; 1] {
154 [json!({
155 "name": "plugin_inspect_selector",
156 "description": "Test if a CSS/XPath selector matches elements in HTML. Returns match count and preview.",
157 "inputSchema": {
158 "type": "object",
159 "properties": {
160 "html": { "type": "string", "description": "HTML to test against" },
161 "selector_css": { "type": "string", "description": "CSS selector to test" },
162 "selector_xpath": { "type": "string", "description": "XPath to test as fallback" }
163 },
164 "required": ["html"]
165 }
166 })]
167 }
168
169 #[must_use]
171 pub fn tools_list(&self) -> Vec<Value> {
172 let mut tools = Vec::with_capacity(8);
173 tools.extend(Self::tools_template_management());
174 tools.extend(Self::tools_extraction());
175 tools.extend(Self::tools_inspection());
176 tools
177 }
178
179 pub async fn handle_tool_call(&self, name: &str, args: &Value) -> Value {
181 let result = match name {
182 "plugin_create_template" => self.tool_create_template(args).await,
183 "plugin_add_region" => self.tool_add_region(args).await,
184 "plugin_apply_template" => self.tool_apply_template(args).await,
185 "plugin_list_templates" => self.tool_list_templates(args).await,
186 "plugin_delete_template" => self.tool_delete_template(args).await,
187 "plugin_get_template" => self.tool_get_template(args).await,
188 "plugin_extract_batch" => self.tool_extract_batch(args).await,
189 "plugin_inspect_selector" => self.tool_inspect_selector(args).await,
190 _ => Err(PluginError::TemplateValidationError(format!(
191 "unknown tool: {name}"
192 ))),
193 };
194
195 match result {
196 Ok(data) => {
197 json!({ "content": [{ "type": "text", "text": serde_json::to_string(&data).unwrap_or_default() }] })
198 }
199 Err(e) => {
200 json!({ "content": [{ "type": "text", "text": format!("Error: {}", e) }], "isError": true })
201 }
202 }
203 }
204
205 async fn tool_create_template(&self, args: &Value) -> Result<Value> {
208 let name = args
209 .get("name")
210 .and_then(Value::as_str)
211 .ok_or_else(|| PluginError::TemplateValidationError("missing 'name'".to_string()))?;
212
213 let description = args
214 .get("description")
215 .and_then(Value::as_str)
216 .map(ToString::to_string);
217
218 let tags = args
219 .get("tags")
220 .and_then(Value::as_array)
221 .map(|a| {
222 a.iter()
223 .filter_map(|v| v.as_str().map(ToString::to_string))
224 .collect()
225 })
226 .unwrap_or_default();
227
228 let mut template = ExtractionTemplate::new(name);
229 if let Some(desc) = description {
230 template = template.with_description(desc);
231 }
232 template = template.with_tags(tags);
233
234 self.template_store.save(&template).await?;
235
236 Ok(json!({
237 "template_id": template.id.to_string(),
238 "name": template.name,
239 "created_at": template.metadata.created_at.to_rfc3339(),
240 }))
241 }
242
243 async fn tool_add_region(&self, args: &Value) -> Result<Value> {
244 let template_id = args
245 .get("template_id")
246 .and_then(Value::as_str)
247 .and_then(|s| Uuid::parse_str(s).ok())
248 .ok_or_else(|| {
249 PluginError::TemplateValidationError("invalid 'template_id'".to_string())
250 })?;
251
252 let region_name = args
253 .get("region_name")
254 .and_then(Value::as_str)
255 .map(ToString::to_string)
256 .ok_or_else(|| {
257 PluginError::TemplateValidationError("missing 'region_name'".to_string())
258 })?;
259
260 let selector_css = args
261 .get("selector_css")
262 .and_then(Value::as_str)
263 .map(ToString::to_string);
264 let selector_xpath = args
265 .get("selector_xpath")
266 .and_then(Value::as_str)
267 .map(ToString::to_string);
268
269 let selector = match (selector_css, selector_xpath) {
270 (Some(css), Some(xpath)) => Selector::dual(css, xpath),
271 (Some(css), None) => Selector::css(css),
272 (None, Some(xpath)) => Selector::xpath(xpath),
273 (None, None) => {
274 return Err(PluginError::TemplateValidationError(
275 "must provide either selector_css or selector_xpath".to_string(),
276 ));
277 }
278 };
279
280 let mut template = self.template_store.get(&template_id).await?;
282
283 let mut transformations = Vec::new();
285 if let Some(arr) = args.get("transformations").and_then(Value::as_array) {
286 for (idx, v) in arr.iter().enumerate() {
287 let s = v.as_str().ok_or_else(|| {
288 PluginError::TemplateValidationError(format!(
289 "transformation at index {idx} must be a string"
290 ))
291 })?;
292 let transformation = parse_transformation(s).map_err(|_| {
293 PluginError::TemplateValidationError(format!(
294 "invalid transformation at index {idx}: '{s}'. Supported transformations: {SUPPORTED_TRANSFORMATIONS}"
295 ))
296 })?;
297 transformations.push(transformation);
298 }
299 }
300
301 let mut region = Region::new(®ion_name, selector, json!({"type": "string"}));
303 for t in transformations {
304 region = region.with_transformation(t);
305 }
306
307 template = template.with_region(region);
308 self.template_store.save(&template).await?;
309
310 Ok(json!({
311 "template_id": template.id.to_string(),
312 "region_name": region_name,
313 "regions_count": template.regions.len(),
314 }))
315 }
316
317 async fn tool_apply_template(&self, args: &Value) -> Result<Value> {
318 let template_id = args
319 .get("template_id")
320 .and_then(Value::as_str)
321 .and_then(|s| Uuid::parse_str(s).ok())
322 .ok_or_else(|| {
323 PluginError::TemplateValidationError("invalid 'template_id'".to_string())
324 })?;
325
326 let html = args
327 .get("html")
328 .and_then(Value::as_str)
329 .map(ToString::to_string)
330 .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
331
332 let url = args
333 .get("url")
334 .and_then(Value::as_str)
335 .map(ToString::to_string)
336 .ok_or_else(|| PluginError::TemplateValidationError("missing 'url'".to_string()))?;
337 let debug = args.get("debug").and_then(Value::as_bool).unwrap_or(false);
338
339 let template = self.template_store.get(&template_id).await?;
340 let request = ExtractionRequest::new(template, &url, &html);
341 let result = self.extraction_engine.execute(&request).await?;
342 let debug_payload = debug.then(|| ExtractionEngine::diagnose(&request, "document"));
343
344 Ok(json!({
345 "data": result.data,
346 "metadata": {
347 "regions_successful": result.metadata.region_status.values().filter(|s| s.success).count(),
348 "total_regions": result.metadata.region_status.len(),
349 "elapsed_ms": result.metadata.elapsed_ms,
350 "region_status": result.metadata.region_status,
351 "errors": result.metadata.errors,
352 },
353 "debug": debug_payload,
354 }))
355 }
356
357 async fn tool_list_templates(&self, _args: &Value) -> Result<Value> {
358 let templates = self.template_store.list().await?;
359 let list: Vec<_> = templates
360 .iter()
361 .map(|t| {
362 json!({
363 "id": t.id.to_string(),
364 "name": &t.name,
365 "description": &t.description,
366 "regions": t.regions.len(),
367 "created_at": t.metadata.created_at.to_rfc3339(),
368 "usage_count": t.metadata.usage_count,
369 "tags": &t.metadata.tags,
370 })
371 })
372 .collect();
373
374 Ok(json!({
375 "count": list.len(),
376 "templates": list,
377 }))
378 }
379
380 async fn tool_delete_template(&self, args: &Value) -> Result<Value> {
381 let template_id = args
382 .get("template_id")
383 .and_then(Value::as_str)
384 .and_then(|s| Uuid::parse_str(s).ok())
385 .ok_or_else(|| {
386 PluginError::TemplateValidationError("invalid 'template_id'".to_string())
387 })?;
388
389 self.template_store.delete(&template_id).await?;
390
391 Ok(json!({
392 "deleted": template_id.to_string(),
393 }))
394 }
395
396 async fn tool_get_template(&self, args: &Value) -> Result<Value> {
397 let template_id = args
398 .get("template_id")
399 .and_then(Value::as_str)
400 .and_then(|s| Uuid::parse_str(s).ok())
401 .ok_or_else(|| {
402 PluginError::TemplateValidationError("invalid 'template_id'".to_string())
403 })?;
404
405 let template = self.template_store.get(&template_id).await?;
406
407 Ok(json!({
408 "id": template.id.to_string(),
409 "name": template.name,
410 "description": template.description,
411 "regions": template.regions.iter().map(|r| {
412 json!({
413 "name": r.name,
414 "selector": format!("{:?}", r.selector),
415 "transformations": r.transformations.iter().map(|t| format!("{t:?}")).collect::<Vec<_>>(),
416 })
417 }).collect::<Vec<_>>(),
418 "metadata": {
419 "created_at": template.metadata.created_at.to_rfc3339(),
420 "updated_at": template.metadata.updated_at.to_rfc3339(),
421 "usage_count": template.metadata.usage_count,
422 }
423 }))
424 }
425
426 async fn tool_extract_batch(&self, args: &Value) -> Result<Value> {
427 let template_id = args
428 .get("template_id")
429 .and_then(Value::as_str)
430 .and_then(|s| Uuid::parse_str(s).ok())
431 .ok_or_else(|| {
432 PluginError::TemplateValidationError("invalid 'template_id'".to_string())
433 })?;
434
435 let html = args
436 .get("html")
437 .and_then(Value::as_str)
438 .map(ToString::to_string)
439 .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
440
441 let url = args
442 .get("url")
443 .and_then(Value::as_str)
444 .map(ToString::to_string)
445 .ok_or_else(|| PluginError::TemplateValidationError("missing 'url'".to_string()))?;
446
447 let root_selector_str = args
448 .get("root_selector")
449 .and_then(Value::as_str)
450 .map(ToString::to_string)
451 .ok_or_else(|| {
452 PluginError::TemplateValidationError("missing 'root_selector'".to_string())
453 })?;
454 let debug = args.get("debug").and_then(Value::as_bool).unwrap_or(false);
455
456 let root_selector =
458 ScraperSelector::parse(&root_selector_str).map_err(|_| PluginError::SelectorError {
459 selector: root_selector_str.clone(),
460 reason: "Failed to parse root_selector as CSS selector".to_string(),
461 })?;
462
463 let root_elements: Vec<String> = {
466 let document = Html::parse_document(&html);
467 document
468 .select(&root_selector)
469 .map(|elem| elem.html())
470 .collect()
471 };
472
473 if root_elements.is_empty() {
474 return Err(PluginError::ExtractionError(format!(
475 "root_selector matched no elements: {root_selector_str}"
476 )));
477 }
478
479 let first_root_html = debug.then(|| {
480 root_elements.first().map(|root| {
481 let mut truncated = String::new();
482 for (index, ch) in root.chars().enumerate() {
483 if index >= 2_000 {
484 truncated.push_str("...");
485 break;
486 }
487 truncated.push(ch);
488 }
489 truncated
490 })
491 });
492
493 let template = self.template_store.get(&template_id).await?;
495 let mut results = Vec::new();
496
497 for root_html in root_elements {
498 let request = ExtractionRequest::new(template.clone(), &url, &root_html);
499 match self.extraction_engine.execute(&request).await {
500 Ok(result) => {
501 results.push(json!({
502 "data": result.data,
503 "successful_regions": result.metadata.region_status.values().filter(|s| s.success).count(),
504 }));
505 }
506 Err(e) => {
507 results.push(json!({
509 "error": e.to_string(),
510 "successful_regions": 0,
511 }));
512 }
513 }
514 }
515
516 Ok(json!({
517 "root_selector": root_selector_str,
518 "results": results,
519 "total_matched": results.len(),
520 "successful": results.iter().filter(|r| r.get("data").is_some()).count(),
521 "debug": debug.then(|| json!({
522 "evaluation_scope": "root_fragment",
523 "first_root_html": first_root_html,
524 })),
525 }))
526 }
527
528 async fn tool_inspect_selector(&self, args: &Value) -> Result<Value> {
529 let html = args
530 .get("html")
531 .and_then(Value::as_str)
532 .map(ToString::to_string)
533 .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
534
535 let selector_css = args
536 .get("selector_css")
537 .and_then(Value::as_str)
538 .map(ToString::to_string);
539 let selector_xpath = args
540 .get("selector_xpath")
541 .and_then(Value::as_str)
542 .map(ToString::to_string);
543
544 let selector = match (&selector_css, &selector_xpath) {
545 (Some(css), Some(xpath)) => Selector::dual(css, xpath),
546 (Some(css), None) => Selector::css(css),
547 (None, Some(xpath)) => Selector::xpath(xpath),
548 (None, None) => {
549 return Err(PluginError::TemplateValidationError(
550 "must provide either selector_css or selector_xpath".to_string(),
551 ));
552 }
553 };
554
555 selector.validate()?;
556
557 if let Some(css) = selector_css {
559 let (is_valid, count) = self
560 .extraction_engine
561 .validate_selector(&html, &css)
562 .await?;
563 Ok(json!({
564 "selector": css,
565 "selector_type": "css",
566 "valid": is_valid,
567 "match_count": count,
568 "preview": if count > 0 { "Selector matched elements" } else { "No elements matched" }
569 }))
570 } else if selector_xpath.is_some() {
571 Ok(json!({
573 "selector": selector_xpath,
574 "selector_type": "xpath",
575 "valid": true,
576 "note": "XPath selectors are not yet supported for validation. Please use CSS selectors to test matches."
577 }))
578 } else {
579 Err(PluginError::TemplateValidationError(
580 "No selector provided".to_string(),
581 ))
582 }
583 }
584}
585
586pub(crate) fn parse_transformation(s: &str) -> Result<Transformation> {
589 match s {
590 "Trim" => Ok(Transformation::Trim),
591 "Lowercase" => Ok(Transformation::Lowercase),
592 "Uppercase" => Ok(Transformation::Uppercase),
593 "RemoveWhitespace" => Ok(Transformation::RemoveWhitespace),
594 "NormalizeWhitespace" => Ok(Transformation::NormalizeWhitespace),
595 "StripHtml" => Ok(Transformation::StripHtml),
596 "DecodeHtml" => Ok(Transformation::DecodeHtml),
597 "ParseJson" => Ok(Transformation::ParseJson),
598 s if s.starts_with("RegexExtract:") => s
599 .strip_prefix("RegexExtract:")
600 .and_then(|rest| rest.rsplit_once('/'))
601 .map_or_else(
602 || {
603 Err(PluginError::TemplateValidationError(
604 "RegexExtract format: RegexExtract:pattern/group".to_string(),
605 ))
606 },
607 |(pattern, group_str)| {
608 let group = group_str.parse::<usize>().map_err(|_| {
609 PluginError::TemplateValidationError(
610 "RegexExtract group must be a positive integer".to_string(),
611 )
612 })?;
613 Ok(Transformation::RegexExtract {
614 pattern: pattern.to_string(),
615 group,
616 })
617 },
618 ),
619 s if s.starts_with("Coerce:") => s.strip_prefix("Coerce:").map_or_else(
620 || {
621 Err(PluginError::TemplateValidationError(
622 "Coerce format: Coerce:type".to_string(),
623 ))
624 },
625 |target_type| {
626 Ok(Transformation::Coerce {
627 target_type: target_type.to_string(),
628 })
629 },
630 ),
631 s if s.starts_with("Filter:") => s.strip_prefix("Filter:").map_or_else(
632 || {
633 Err(PluginError::TemplateValidationError(
634 "Filter format: Filter:pattern".to_string(),
635 ))
636 },
637 |pattern| {
638 Ok(Transformation::Filter {
639 pattern: pattern.to_string(),
640 })
641 },
642 ),
643 s if s.starts_with("Regex:") => s
644 .strip_prefix("Regex:")
645 .and_then(|rest| rest.split_once('/'))
646 .map_or_else(
647 || {
648 Err(PluginError::TemplateValidationError(
649 "Regex format: Regex:pattern/replacement".to_string(),
650 ))
651 },
652 |(pattern, replacement)| {
653 Ok(Transformation::Regex {
654 pattern: pattern.to_string(),
655 replacement: replacement.to_string(),
656 })
657 },
658 ),
659 _ => Err(PluginError::TemplateValidationError(format!(
660 "unknown transformation: {s}"
661 ))),
662 }
663}
664
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: every supported spelling is accepted and an unknown one is rejected.
    #[test]
    fn test_parse_transformation() {
        assert!(parse_transformation("Trim").is_ok());
        assert!(parse_transformation("Lowercase").is_ok());
        assert!(parse_transformation("Regex:pattern/replace").is_ok());
        assert!(parse_transformation("RegexExtract:price:(\\d+\\.\\d+)/1").is_ok());
        assert!(parse_transformation("Coerce:number").is_ok());
        assert!(parse_transformation("Filter:^ok$").is_ok());
        assert!(parse_transformation("Invalid").is_err());
    }

    /// Plain names map to the matching unit variants and are case-sensitive.
    #[test]
    fn plain_names_map_to_their_variants() {
        assert!(matches!(
            parse_transformation("Trim"),
            Ok(Transformation::Trim)
        ));
        assert!(matches!(
            parse_transformation("Uppercase"),
            Ok(Transformation::Uppercase)
        ));
        assert!(matches!(
            parse_transformation("ParseJson"),
            Ok(Transformation::ParseJson)
        ));
        assert!(parse_transformation("trim").is_err());
        assert!(parse_transformation("").is_err());
    }

    /// `Regex:` splits at the first `/`; everything after it is the replacement.
    #[test]
    fn regex_splits_at_first_slash() {
        assert!(matches!(
            parse_transformation("Regex:a/b/c"),
            Ok(Transformation::Regex { pattern, replacement })
                if pattern == "a" && replacement == "b/c"
        ));
        assert!(parse_transformation("Regex:no-separator").is_err());
    }

    /// `RegexExtract:` splits at the last `/`; the tail must be an unsigned group index.
    #[test]
    fn regex_extract_splits_at_last_slash() {
        assert!(matches!(
            parse_transformation("RegexExtract:price:(\\d+\\.\\d+)/1"),
            Ok(Transformation::RegexExtract { pattern, group })
                if pattern == "price:(\\d+\\.\\d+)" && group == 1
        ));
        assert!(matches!(
            parse_transformation("RegexExtract:a/b/2"),
            Ok(Transformation::RegexExtract { pattern, group })
                if pattern == "a/b" && group == 2
        ));
        assert!(parse_transformation("RegexExtract:no-separator").is_err());
        assert!(parse_transformation("RegexExtract:abc/x").is_err());
        assert!(parse_transformation("RegexExtract:abc/-1").is_err());
        assert!(parse_transformation("RegexExtract:abc/").is_err());
    }

    /// `Coerce:` and `Filter:` keep everything after the prefix verbatim.
    #[test]
    fn coerce_and_filter_keep_their_payload() {
        assert!(matches!(
            parse_transformation("Coerce:number"),
            Ok(Transformation::Coerce { target_type }) if target_type == "number"
        ));
        assert!(matches!(
            parse_transformation("Filter:^ok$"),
            Ok(Transformation::Filter { pattern }) if pattern == "^ok$"
        ));
    }
}
679}