1use scraper::{Html, Selector as ScraperSelector};
6use serde_json::{Value, json};
7use std::sync::Arc;
8use uuid::Uuid;
9
10use crate::{
11 ExtractionRequest, PluginError, Result,
12 adapters::ExtractionEngine,
13 domain::{ExtractionTemplate, Region, Selector, Transformation},
14 ports::{IdempotencyKeyStore, PluginExtractionPort, PluginTemplateStore},
15 storage::{FileTemplateStore, MemoryIdempotencyStore},
16};
17
/// Comma-separated list of the transformation spec strings recognised by
/// `parse_transformation`. It is appended to the validation error that
/// `tool_add_region` returns when a transformation string cannot be parsed.
const SUPPORTED_TRANSFORMATIONS: &str = "Trim, Lowercase, Uppercase, RemoveWhitespace, NormalizeWhitespace, StripHtml, DecodeHtml, ParseJson, Regex:pattern/replacement, RegexExtract:pattern/group, Coerce:type, Filter:pattern";
19
/// Tool server for the plugin: advertises the `plugin_*` tool descriptors and
/// dispatches calls to them.
///
/// All three collaborators are held as shared trait objects, so alternative
/// implementations can be injected through `with_adapters`.
// NOTE(review): `idempotency_store` is assigned by both constructors but is not
// read by any handler visible in this file, which is presumably why dead-code
// warnings are silenced here — confirm before removing the attribute.
#[allow(dead_code)]
pub struct McpPluginServer {
    /// Store used to save, load, list and delete extraction templates.
    template_store: Arc<dyn PluginTemplateStore>,
    /// Engine that executes extraction requests and checks CSS selectors against HTML.
    extraction_engine: Arc<dyn PluginExtractionPort>,
    /// Idempotency key store; not consulted by the handlers shown in this file.
    idempotency_store: Arc<dyn IdempotencyKeyStore>,
}
27
28impl McpPluginServer {
29 pub fn new_with_file_storage(templates_dir: std::path::PathBuf) -> Self {
31 Self {
32 template_store: Arc::new(FileTemplateStore::new(templates_dir)),
33 extraction_engine: Arc::new(ExtractionEngine),
34 idempotency_store: Arc::new(MemoryIdempotencyStore::new()),
35 }
36 }
37
    /// Creates a server from caller-supplied adapters.
    ///
    /// The arguments are stored as given, without wrapping or validation; this
    /// is the injection point for alternative store and engine implementations.
    pub fn with_adapters(
        template_store: Arc<dyn PluginTemplateStore>,
        extraction_engine: Arc<dyn PluginExtractionPort>,
        idempotency_store: Arc<dyn IdempotencyKeyStore>,
    ) -> Self {
        Self {
            template_store,
            extraction_engine,
            idempotency_store,
        }
    }
50
51 fn tools_template_management() -> [Value; 3] {
52 [
53 json!({
54 "name": "plugin_create_template",
55 "description": "Create a new extraction template with the given name and optional description. Returns the template UUID.",
56 "inputSchema": {
57 "type": "object",
58 "properties": {
59 "name": { "type": "string", "description": "Template name (e.g., 'Product Listings')" },
60 "description": { "type": "string", "description": "Optional template description" },
61 "tags": {
62 "type": "array",
63 "items": { "type": "string" },
64 "description": "Optional tags for organization"
65 }
66 },
67 "required": ["name"]
68 }
69 }),
70 json!({
71 "name": "plugin_list_templates",
72 "description": "List all saved extraction templates with metadata.",
73 "inputSchema": { "type": "object", "properties": {} }
74 }),
75 json!({
76 "name": "plugin_delete_template",
77 "description": "Delete an extraction template permanently.",
78 "inputSchema": {
79 "type": "object",
80 "properties": {
81 "template_id": { "type": "string", "description": "UUID of the template to delete" }
82 },
83 "required": ["template_id"]
84 }
85 }),
86 ]
87 }
88
89 fn tools_extraction() -> [Value; 4] {
90 [
91 json!({
92 "name": "plugin_add_region",
93 "description": "Add an extraction region (named zone) to a template. A region is a named selector with transformations.",
94 "inputSchema": {
95 "type": "object",
96 "properties": {
97 "template_id": { "type": "string", "description": "UUID of the template" },
98 "region_name": { "type": "string", "description": "Unique name for this region (e.g., 'product_title')" },
99 "selector_css": { "type": "string", "description": "Optional CSS selector" },
100 "selector_xpath": { "type": "string", "description": "Optional XPath selector" },
101 "transformations": {
102 "type": "array",
103 "items": { "type": "string" },
104 "description": "Ordered transformations: 'Trim', 'Lowercase', 'Regex:pattern/replace', 'StripHtml', etc."
105 }
106 },
107 "required": ["template_id", "region_name"]
108 }
109 }),
110 json!({
111 "name": "plugin_apply_template",
112 "description": "Apply an extraction template to HTML content. Returns extracted data for each region.",
113 "inputSchema": {
114 "type": "object",
115 "properties": {
116 "template_id": { "type": "string", "description": "UUID of the template to apply" },
117 "html": { "type": "string", "description": "HTML content to extract from" },
118 "url": { "type": "string", "description": "Source URL (for logging/context)" }
119 },
120 "required": ["template_id", "html", "url"]
121 }
122 }),
123 json!({
124 "name": "plugin_get_template",
125 "description": "Retrieve a template's full configuration.",
126 "inputSchema": {
127 "type": "object",
128 "properties": {
129 "template_id": { "type": "string", "description": "UUID of the template" }
130 },
131 "required": ["template_id"]
132 }
133 }),
134 json!({
135 "name": "plugin_extract_batch",
136 "description": "Apply a template to extract multiple instances from a page (e.g., all products).",
137 "inputSchema": {
138 "type": "object",
139 "properties": {
140 "template_id": { "type": "string", "description": "UUID of the template" },
141 "html": { "type": "string", "description": "HTML content" },
142 "url": { "type": "string", "description": "Source URL" },
143 "root_selector": { "type": "string", "description": "CSS selector for parent containers to iterate over" }
144 },
145 "required": ["template_id", "html", "url", "root_selector"]
146 }
147 }),
148 ]
149 }
150
151 fn tools_inspection() -> [Value; 1] {
152 [json!({
153 "name": "plugin_inspect_selector",
154 "description": "Test if a CSS/XPath selector matches elements in HTML. Returns match count and preview.",
155 "inputSchema": {
156 "type": "object",
157 "properties": {
158 "html": { "type": "string", "description": "HTML to test against" },
159 "selector_css": { "type": "string", "description": "CSS selector to test" },
160 "selector_xpath": { "type": "string", "description": "XPath to test as fallback" }
161 },
162 "required": ["html"]
163 }
164 })]
165 }
166
167 pub fn tools_list(&self) -> Vec<Value> {
169 let mut tools = Vec::with_capacity(8);
170 tools.extend(Self::tools_template_management());
171 tools.extend(Self::tools_extraction());
172 tools.extend(Self::tools_inspection());
173 tools
174 }
175
176 pub async fn handle_tool_call(&self, name: &str, args: &Value) -> Value {
178 let result = match name {
179 "plugin_create_template" => self.tool_create_template(args).await,
180 "plugin_add_region" => self.tool_add_region(args).await,
181 "plugin_apply_template" => self.tool_apply_template(args).await,
182 "plugin_list_templates" => self.tool_list_templates(args).await,
183 "plugin_delete_template" => self.tool_delete_template(args).await,
184 "plugin_get_template" => self.tool_get_template(args).await,
185 "plugin_extract_batch" => self.tool_extract_batch(args).await,
186 "plugin_inspect_selector" => self.tool_inspect_selector(args).await,
187 _ => Err(PluginError::TemplateValidationError(format!(
188 "unknown tool: {name}"
189 ))),
190 };
191
192 match result {
193 Ok(data) => {
194 json!({ "content": [{ "type": "text", "text": serde_json::to_string(&data).unwrap_or_default() }] })
195 }
196 Err(e) => {
197 json!({ "content": [{ "type": "text", "text": format!("Error: {}", e) }], "isError": true })
198 }
199 }
200 }
201
202 async fn tool_create_template(&self, args: &Value) -> Result<Value> {
205 let name = args
206 .get("name")
207 .and_then(Value::as_str)
208 .ok_or_else(|| PluginError::TemplateValidationError("missing 'name'".to_string()))?;
209
210 let description = args
211 .get("description")
212 .and_then(Value::as_str)
213 .map(ToString::to_string);
214
215 let tags = args
216 .get("tags")
217 .and_then(Value::as_array)
218 .map(|a| {
219 a.iter()
220 .filter_map(|v| v.as_str().map(ToString::to_string))
221 .collect()
222 })
223 .unwrap_or_default();
224
225 let mut template = ExtractionTemplate::new(name);
226 if let Some(desc) = description {
227 template = template.with_description(desc);
228 }
229 template = template.with_tags(tags);
230
231 self.template_store.save(&template).await?;
232
233 Ok(json!({
234 "template_id": template.id.to_string(),
235 "name": template.name,
236 "created_at": template.metadata.created_at.to_rfc3339(),
237 }))
238 }
239
240 async fn tool_add_region(&self, args: &Value) -> Result<Value> {
241 let template_id = args
242 .get("template_id")
243 .and_then(Value::as_str)
244 .and_then(|s| Uuid::parse_str(s).ok())
245 .ok_or_else(|| {
246 PluginError::TemplateValidationError("invalid 'template_id'".to_string())
247 })?;
248
249 let region_name = args
250 .get("region_name")
251 .and_then(Value::as_str)
252 .map(ToString::to_string)
253 .ok_or_else(|| {
254 PluginError::TemplateValidationError("missing 'region_name'".to_string())
255 })?;
256
257 let selector_css = args
258 .get("selector_css")
259 .and_then(Value::as_str)
260 .map(ToString::to_string);
261 let selector_xpath = args
262 .get("selector_xpath")
263 .and_then(Value::as_str)
264 .map(ToString::to_string);
265
266 let selector = match (selector_css, selector_xpath) {
267 (Some(css), Some(xpath)) => Selector::dual(css, xpath),
268 (Some(css), None) => Selector::css(css),
269 (None, Some(xpath)) => Selector::xpath(xpath),
270 (None, None) => {
271 return Err(PluginError::TemplateValidationError(
272 "must provide either selector_css or selector_xpath".to_string(),
273 ));
274 }
275 };
276
277 let mut template = self.template_store.get(&template_id).await?;
279
280 let mut transformations = Vec::new();
282 if let Some(arr) = args.get("transformations").and_then(Value::as_array) {
283 for (idx, v) in arr.iter().enumerate() {
284 let s = v.as_str().ok_or_else(|| {
285 PluginError::TemplateValidationError(format!(
286 "transformation at index {idx} must be a string"
287 ))
288 })?;
289 let transformation = parse_transformation(s).map_err(|_| {
290 PluginError::TemplateValidationError(format!(
291 "invalid transformation at index {idx}: '{s}'. Supported transformations: {SUPPORTED_TRANSFORMATIONS}"
292 ))
293 })?;
294 transformations.push(transformation);
295 }
296 }
297
298 let mut region = Region::new(®ion_name, selector, json!({"type": "string"}));
300 for t in transformations {
301 region = region.with_transformation(t);
302 }
303
304 template = template.with_region(region);
305 self.template_store.save(&template).await?;
306
307 Ok(json!({
308 "template_id": template.id.to_string(),
309 "region_name": region_name,
310 "regions_count": template.regions.len(),
311 }))
312 }
313
314 async fn tool_apply_template(&self, args: &Value) -> Result<Value> {
315 let template_id = args
316 .get("template_id")
317 .and_then(Value::as_str)
318 .and_then(|s| Uuid::parse_str(s).ok())
319 .ok_or_else(|| {
320 PluginError::TemplateValidationError("invalid 'template_id'".to_string())
321 })?;
322
323 let html = args
324 .get("html")
325 .and_then(Value::as_str)
326 .map(ToString::to_string)
327 .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
328
329 let url = args
330 .get("url")
331 .and_then(Value::as_str)
332 .map(ToString::to_string)
333 .ok_or_else(|| PluginError::TemplateValidationError("missing 'url'".to_string()))?;
334
335 let template = self.template_store.get(&template_id).await?;
336 let request = ExtractionRequest::new(template, &url, &html);
337 let result = self.extraction_engine.execute(&request).await?;
338
339 Ok(json!({
340 "data": result.data,
341 "metadata": {
342 "regions_successful": result.metadata.region_status.values().filter(|s| s.success).count(),
343 "total_regions": result.metadata.region_status.len(),
344 "elapsed_ms": result.metadata.elapsed_ms,
345 }
346 }))
347 }
348
349 async fn tool_list_templates(&self, _args: &Value) -> Result<Value> {
350 let templates = self.template_store.list().await?;
351 let list: Vec<_> = templates
352 .iter()
353 .map(|t| {
354 json!({
355 "id": t.id.to_string(),
356 "name": &t.name,
357 "description": &t.description,
358 "regions": t.regions.len(),
359 "created_at": t.metadata.created_at.to_rfc3339(),
360 "usage_count": t.metadata.usage_count,
361 "tags": &t.metadata.tags,
362 })
363 })
364 .collect();
365
366 Ok(json!({
367 "count": list.len(),
368 "templates": list,
369 }))
370 }
371
372 async fn tool_delete_template(&self, args: &Value) -> Result<Value> {
373 let template_id = args
374 .get("template_id")
375 .and_then(Value::as_str)
376 .and_then(|s| Uuid::parse_str(s).ok())
377 .ok_or_else(|| {
378 PluginError::TemplateValidationError("invalid 'template_id'".to_string())
379 })?;
380
381 self.template_store.delete(&template_id).await?;
382
383 Ok(json!({
384 "deleted": template_id.to_string(),
385 }))
386 }
387
388 async fn tool_get_template(&self, args: &Value) -> Result<Value> {
389 let template_id = args
390 .get("template_id")
391 .and_then(Value::as_str)
392 .and_then(|s| Uuid::parse_str(s).ok())
393 .ok_or_else(|| {
394 PluginError::TemplateValidationError("invalid 'template_id'".to_string())
395 })?;
396
397 let template = self.template_store.get(&template_id).await?;
398
399 Ok(json!({
400 "id": template.id.to_string(),
401 "name": template.name,
402 "description": template.description,
403 "regions": template.regions.iter().map(|r| {
404 json!({
405 "name": r.name,
406 "selector": format!("{:?}", r.selector),
407 "transformations": r.transformations.iter().map(|t| format!("{t:?}")).collect::<Vec<_>>(),
408 })
409 }).collect::<Vec<_>>(),
410 "metadata": {
411 "created_at": template.metadata.created_at.to_rfc3339(),
412 "updated_at": template.metadata.updated_at.to_rfc3339(),
413 "usage_count": template.metadata.usage_count,
414 }
415 }))
416 }
417
418 async fn tool_extract_batch(&self, args: &Value) -> Result<Value> {
419 let template_id = args
420 .get("template_id")
421 .and_then(Value::as_str)
422 .and_then(|s| Uuid::parse_str(s).ok())
423 .ok_or_else(|| {
424 PluginError::TemplateValidationError("invalid 'template_id'".to_string())
425 })?;
426
427 let html = args
428 .get("html")
429 .and_then(Value::as_str)
430 .map(ToString::to_string)
431 .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
432
433 let url = args
434 .get("url")
435 .and_then(Value::as_str)
436 .map(ToString::to_string)
437 .ok_or_else(|| PluginError::TemplateValidationError("missing 'url'".to_string()))?;
438
439 let root_selector_str = args
440 .get("root_selector")
441 .and_then(Value::as_str)
442 .map(ToString::to_string)
443 .ok_or_else(|| {
444 PluginError::TemplateValidationError("missing 'root_selector'".to_string())
445 })?;
446
447 let root_selector =
449 ScraperSelector::parse(&root_selector_str).map_err(|_| PluginError::SelectorError {
450 selector: root_selector_str.clone(),
451 reason: "Failed to parse root_selector as CSS selector".to_string(),
452 })?;
453
454 let root_elements: Vec<String> = {
457 let document = Html::parse_document(&html);
458 document
459 .select(&root_selector)
460 .map(|elem| elem.inner_html())
461 .collect()
462 };
463
464 if root_elements.is_empty() {
465 return Err(PluginError::ExtractionError(format!(
466 "root_selector matched no elements: {root_selector_str}"
467 )));
468 }
469
470 let template = self.template_store.get(&template_id).await?;
472 let mut results = Vec::new();
473
474 for root_html in root_elements {
475 let request = ExtractionRequest::new(template.clone(), &url, &root_html);
476 match self.extraction_engine.execute(&request).await {
477 Ok(result) => {
478 results.push(json!({
479 "data": result.data,
480 "successful_regions": result.metadata.region_status.values().filter(|s| s.success).count(),
481 }));
482 }
483 Err(e) => {
484 results.push(json!({
486 "error": e.to_string(),
487 "successful_regions": 0,
488 }));
489 }
490 }
491 }
492
493 Ok(json!({
494 "root_selector": root_selector_str,
495 "results": results,
496 "total_matched": results.len(),
497 "successful": results.iter().filter(|r| r.get("data").is_some()).count(),
498 }))
499 }
500
501 async fn tool_inspect_selector(&self, args: &Value) -> Result<Value> {
502 let html = args
503 .get("html")
504 .and_then(Value::as_str)
505 .map(ToString::to_string)
506 .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
507
508 let selector_css = args
509 .get("selector_css")
510 .and_then(Value::as_str)
511 .map(ToString::to_string);
512 let selector_xpath = args
513 .get("selector_xpath")
514 .and_then(Value::as_str)
515 .map(ToString::to_string);
516
517 let selector = match (&selector_css, &selector_xpath) {
518 (Some(css), Some(xpath)) => Selector::dual(css, xpath),
519 (Some(css), None) => Selector::css(css),
520 (None, Some(xpath)) => Selector::xpath(xpath),
521 (None, None) => {
522 return Err(PluginError::TemplateValidationError(
523 "must provide either selector_css or selector_xpath".to_string(),
524 ));
525 }
526 };
527
528 selector.validate()?;
529
530 if let Some(css) = selector_css {
532 let (is_valid, count) = self
533 .extraction_engine
534 .validate_selector(&html, &css)
535 .await?;
536 Ok(json!({
537 "selector": css,
538 "selector_type": "css",
539 "valid": is_valid,
540 "match_count": count,
541 "preview": if count > 0 { "Selector matched elements" } else { "No elements matched" }
542 }))
543 } else if selector_xpath.is_some() {
544 Ok(json!({
546 "selector": selector_xpath,
547 "selector_type": "xpath",
548 "valid": true,
549 "note": "XPath selectors are not yet supported for validation. Please use CSS selectors to test matches."
550 }))
551 } else {
552 Err(PluginError::TemplateValidationError(
553 "No selector provided".to_string(),
554 ))
555 }
556 }
557}
558
559pub(crate) fn parse_transformation(s: &str) -> Result<Transformation> {
562 match s {
563 "Trim" => Ok(Transformation::Trim),
564 "Lowercase" => Ok(Transformation::Lowercase),
565 "Uppercase" => Ok(Transformation::Uppercase),
566 "RemoveWhitespace" => Ok(Transformation::RemoveWhitespace),
567 "NormalizeWhitespace" => Ok(Transformation::NormalizeWhitespace),
568 "StripHtml" => Ok(Transformation::StripHtml),
569 "DecodeHtml" => Ok(Transformation::DecodeHtml),
570 "ParseJson" => Ok(Transformation::ParseJson),
571 s if s.starts_with("RegexExtract:") => s
572 .strip_prefix("RegexExtract:")
573 .and_then(|rest| rest.rsplit_once('/'))
574 .map_or_else(
575 || {
576 Err(PluginError::TemplateValidationError(
577 "RegexExtract format: RegexExtract:pattern/group".to_string(),
578 ))
579 },
580 |(pattern, group_str)| {
581 let group = group_str.parse::<usize>().map_err(|_| {
582 PluginError::TemplateValidationError(
583 "RegexExtract group must be a positive integer".to_string(),
584 )
585 })?;
586 Ok(Transformation::RegexExtract {
587 pattern: pattern.to_string(),
588 group,
589 })
590 },
591 ),
592 s if s.starts_with("Coerce:") => s.strip_prefix("Coerce:").map_or_else(
593 || {
594 Err(PluginError::TemplateValidationError(
595 "Coerce format: Coerce:type".to_string(),
596 ))
597 },
598 |target_type| {
599 Ok(Transformation::Coerce {
600 target_type: target_type.to_string(),
601 })
602 },
603 ),
604 s if s.starts_with("Filter:") => s.strip_prefix("Filter:").map_or_else(
605 || {
606 Err(PluginError::TemplateValidationError(
607 "Filter format: Filter:pattern".to_string(),
608 ))
609 },
610 |pattern| {
611 Ok(Transformation::Filter {
612 pattern: pattern.to_string(),
613 })
614 },
615 ),
616 s if s.starts_with("Regex:") => s
617 .strip_prefix("Regex:")
618 .and_then(|rest| rest.split_once('/'))
619 .map_or_else(
620 || {
621 Err(PluginError::TemplateValidationError(
622 "Regex format: Regex:pattern/replacement".to_string(),
623 ))
624 },
625 |(pattern, replacement)| {
626 Ok(Transformation::Regex {
627 pattern: pattern.to_string(),
628 replacement: replacement.to_string(),
629 })
630 },
631 ),
632 _ => Err(PluginError::TemplateValidationError(format!(
633 "unknown transformation: {s}"
634 ))),
635 }
636}
637
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: a representative spec of each supported shape parses, and
    /// an unknown name is rejected.
    #[test]
    fn test_parse_transformation() {
        let accepted = [
            "Trim",
            "Lowercase",
            "Regex:pattern/replace",
            "RegexExtract:price:(\\d+\\.\\d+)/1",
            "Coerce:number",
            "Filter:^ok$",
        ];
        for spec in accepted {
            assert!(
                parse_transformation(spec).is_ok(),
                "expected `{spec}` to parse"
            );
        }

        assert!(parse_transformation("Invalid").is_err());
    }
}