Skip to main content

stygian_plugin/mcp/
server.rs

1//! MCP plugin server implementation
2//!
3//! Provides the core tool definitions and request handling for plugin extraction.
4
5use scraper::{Html, Selector as ScraperSelector};
6use serde_json::{Value, json};
7use std::sync::Arc;
8use uuid::Uuid;
9
10use crate::{
11    ExtractionRequest, PluginError, Result,
12    adapters::ExtractionEngine,
13    domain::{ExtractionTemplate, Region, Selector, Transformation},
14    ports::{IdempotencyKeyStore, PluginExtractionPort, PluginTemplateStore},
15    storage::{FileTemplateStore, MemoryIdempotencyStore},
16};
17
/// Human-readable list of every transformation spelling accepted by
/// `parse_transformation`, embedded in validation error messages.
const SUPPORTED_TRANSFORMATIONS: &str = "Trim, Lowercase, Uppercase, RemoveWhitespace, \
    NormalizeWhitespace, StripHtml, DecodeHtml, ParseJson, Regex:pattern/replacement, \
    RegexExtract:pattern/group, Coerce:type, Filter:pattern";
19
20/// MCP server providing plugin extraction tools
21#[allow(dead_code)]
22pub struct McpPluginServer {
23    template_store: Arc<dyn PluginTemplateStore>,
24    extraction_engine: Arc<dyn PluginExtractionPort>,
25    idempotency_store: Arc<dyn IdempotencyKeyStore>,
26}
27
28impl McpPluginServer {
29    /// Create a new plugin MCP server with file-based storage (development)
30    #[must_use]
31    pub fn new_with_file_storage(templates_dir: std::path::PathBuf) -> Self {
32        Self {
33            template_store: Arc::new(FileTemplateStore::new(templates_dir)),
34            extraction_engine: Arc::new(ExtractionEngine),
35            idempotency_store: Arc::new(MemoryIdempotencyStore::new()),
36        }
37    }
38
39    /// Create with custom adapters
40    pub fn with_adapters(
41        template_store: Arc<dyn PluginTemplateStore>,
42        extraction_engine: Arc<dyn PluginExtractionPort>,
43        idempotency_store: Arc<dyn IdempotencyKeyStore>,
44    ) -> Self {
45        Self {
46            template_store,
47            extraction_engine,
48            idempotency_store,
49        }
50    }
51
52    fn tools_template_management() -> [Value; 3] {
53        [
54            json!({
55                "name": "plugin_create_template",
56                "description": "Create a new extraction template with the given name and optional description. Returns the template UUID.",
57                "inputSchema": {
58                    "type": "object",
59                    "properties": {
60                        "name": { "type": "string", "description": "Template name (e.g., 'Product Listings')" },
61                        "description": { "type": "string", "description": "Optional template description" },
62                        "tags": {
63                            "type": "array",
64                            "items": { "type": "string" },
65                            "description": "Optional tags for organization"
66                        }
67                    },
68                    "required": ["name"]
69                }
70            }),
71            json!({
72                "name": "plugin_list_templates",
73                "description": "List all saved extraction templates with metadata.",
74                "inputSchema": { "type": "object", "properties": {} }
75            }),
76            json!({
77                "name": "plugin_delete_template",
78                "description": "Delete an extraction template permanently.",
79                "inputSchema": {
80                    "type": "object",
81                    "properties": {
82                        "template_id": { "type": "string", "description": "UUID of the template to delete" }
83                    },
84                    "required": ["template_id"]
85                }
86            }),
87        ]
88    }
89
90    fn tools_extraction() -> [Value; 4] {
91        [
92            json!({
93                "name": "plugin_add_region",
94                "description": "Add an extraction region (named zone) to a template. A region is a named selector with transformations.",
95                "inputSchema": {
96                    "type": "object",
97                    "properties": {
98                        "template_id": { "type": "string", "description": "UUID of the template" },
99                        "region_name": { "type": "string", "description": "Unique name for this region (e.g., 'product_title')" },
100                        "selector_css": { "type": "string", "description": "Optional CSS selector" },
101                        "selector_xpath": { "type": "string", "description": "Optional XPath selector" },
102                        "transformations": {
103                            "type": "array",
104                            "items": { "type": "string" },
105                            "description": "Ordered transformations: 'Trim', 'Lowercase', 'Regex:pattern/replace', 'StripHtml', etc."
106                        }
107                    },
108                    "required": ["template_id", "region_name"]
109                }
110            }),
111            json!({
112                "name": "plugin_apply_template",
113                "description": "Apply an extraction template to HTML content. Returns extracted data for each region.",
114                "inputSchema": {
115                    "type": "object",
116                    "properties": {
117                        "template_id": { "type": "string", "description": "UUID of the template to apply" },
118                        "html": { "type": "string", "description": "HTML content to extract from" },
119                        "url": { "type": "string", "description": "Source URL (for logging/context)" },
120                        "debug": { "type": "boolean", "description": "Include per-region selector diagnostics and root HTML snippet." }
121                    },
122                    "required": ["template_id", "html", "url"]
123                }
124            }),
125            json!({
126                "name": "plugin_get_template",
127                "description": "Retrieve a template's full configuration.",
128                "inputSchema": {
129                    "type": "object",
130                    "properties": {
131                        "template_id": { "type": "string", "description": "UUID of the template" }
132                    },
133                    "required": ["template_id"]
134                }
135            }),
136            json!({
137                "name": "plugin_extract_batch",
138                "description": "Apply a template to extract multiple instances from a page (e.g., all products).",
139                "inputSchema": {
140                    "type": "object",
141                    "properties": {
142                        "template_id": { "type": "string", "description": "UUID of the template" },
143                        "html": { "type": "string", "description": "HTML content" },
144                        "url": { "type": "string", "description": "Source URL" },
145                        "root_selector": { "type": "string", "description": "CSS selector for parent containers to iterate over" }
146                    },
147                    "required": ["template_id", "html", "url", "root_selector"]
148                }
149            }),
150        ]
151    }
152
153    fn tools_inspection() -> [Value; 1] {
154        [json!({
155            "name": "plugin_inspect_selector",
156            "description": "Test if a CSS/XPath selector matches elements in HTML. Returns match count and preview.",
157            "inputSchema": {
158                "type": "object",
159                "properties": {
160                    "html": { "type": "string", "description": "HTML to test against" },
161                    "selector_css": { "type": "string", "description": "CSS selector to test" },
162                    "selector_xpath": { "type": "string", "description": "XPath to test as fallback" }
163                },
164                "required": ["html"]
165            }
166        })]
167    }
168
169    /// Get the tool list for MCP protocol
170    #[must_use]
171    pub fn tools_list(&self) -> Vec<Value> {
172        let mut tools = Vec::with_capacity(8);
173        tools.extend(Self::tools_template_management());
174        tools.extend(Self::tools_extraction());
175        tools.extend(Self::tools_inspection());
176        tools
177    }
178
179    /// Handle a tool call
180    pub async fn handle_tool_call(&self, name: &str, args: &Value) -> Value {
181        let result = match name {
182            "plugin_create_template" => self.tool_create_template(args).await,
183            "plugin_add_region" => self.tool_add_region(args).await,
184            "plugin_apply_template" => self.tool_apply_template(args).await,
185            "plugin_list_templates" => self.tool_list_templates(args).await,
186            "plugin_delete_template" => self.tool_delete_template(args).await,
187            "plugin_get_template" => self.tool_get_template(args).await,
188            "plugin_extract_batch" => self.tool_extract_batch(args).await,
189            "plugin_inspect_selector" => self.tool_inspect_selector(args).await,
190            _ => Err(PluginError::TemplateValidationError(format!(
191                "unknown tool: {name}"
192            ))),
193        };
194
195        match result {
196            Ok(data) => {
197                json!({ "content": [{ "type": "text", "text": serde_json::to_string(&data).unwrap_or_default() }] })
198            }
199            Err(e) => {
200                json!({ "content": [{ "type": "text", "text": format!("Error: {}", e) }], "isError": true })
201            }
202        }
203    }
204
205    // ── Tool implementations ───────────────────────────────────────────────
206
207    async fn tool_create_template(&self, args: &Value) -> Result<Value> {
208        let name = args
209            .get("name")
210            .and_then(Value::as_str)
211            .ok_or_else(|| PluginError::TemplateValidationError("missing 'name'".to_string()))?;
212
213        let description = args
214            .get("description")
215            .and_then(Value::as_str)
216            .map(ToString::to_string);
217
218        let tags = args
219            .get("tags")
220            .and_then(Value::as_array)
221            .map(|a| {
222                a.iter()
223                    .filter_map(|v| v.as_str().map(ToString::to_string))
224                    .collect()
225            })
226            .unwrap_or_default();
227
228        let mut template = ExtractionTemplate::new(name);
229        if let Some(desc) = description {
230            template = template.with_description(desc);
231        }
232        template = template.with_tags(tags);
233
234        self.template_store.save(&template).await?;
235
236        Ok(json!({
237            "template_id": template.id.to_string(),
238            "name": template.name,
239            "created_at": template.metadata.created_at.to_rfc3339(),
240        }))
241    }
242
243    async fn tool_add_region(&self, args: &Value) -> Result<Value> {
244        let template_id = args
245            .get("template_id")
246            .and_then(Value::as_str)
247            .and_then(|s| Uuid::parse_str(s).ok())
248            .ok_or_else(|| {
249                PluginError::TemplateValidationError("invalid 'template_id'".to_string())
250            })?;
251
252        let region_name = args
253            .get("region_name")
254            .and_then(Value::as_str)
255            .map(ToString::to_string)
256            .ok_or_else(|| {
257                PluginError::TemplateValidationError("missing 'region_name'".to_string())
258            })?;
259
260        let selector_css = args
261            .get("selector_css")
262            .and_then(Value::as_str)
263            .map(ToString::to_string);
264        let selector_xpath = args
265            .get("selector_xpath")
266            .and_then(Value::as_str)
267            .map(ToString::to_string);
268
269        let selector = match (selector_css, selector_xpath) {
270            (Some(css), Some(xpath)) => Selector::dual(css, xpath),
271            (Some(css), None) => Selector::css(css),
272            (None, Some(xpath)) => Selector::xpath(xpath),
273            (None, None) => {
274                return Err(PluginError::TemplateValidationError(
275                    "must provide either selector_css or selector_xpath".to_string(),
276                ));
277            }
278        };
279
280        // Load template
281        let mut template = self.template_store.get(&template_id).await?;
282
283        // Parse transformations - validate all entries and fail on first error
284        let mut transformations = Vec::new();
285        if let Some(arr) = args.get("transformations").and_then(Value::as_array) {
286            for (idx, v) in arr.iter().enumerate() {
287                let s = v.as_str().ok_or_else(|| {
288                    PluginError::TemplateValidationError(format!(
289                        "transformation at index {idx} must be a string"
290                    ))
291                })?;
292                let transformation = parse_transformation(s).map_err(|_| {
293                    PluginError::TemplateValidationError(format!(
294                        "invalid transformation at index {idx}: '{s}'. Supported transformations: {SUPPORTED_TRANSFORMATIONS}"
295                    ))
296                })?;
297                transformations.push(transformation);
298            }
299        }
300
301        // Create region
302        let mut region = Region::new(&region_name, selector, json!({"type": "string"}));
303        for t in transformations {
304            region = region.with_transformation(t);
305        }
306
307        template = template.with_region(region);
308        self.template_store.save(&template).await?;
309
310        Ok(json!({
311            "template_id": template.id.to_string(),
312            "region_name": region_name,
313            "regions_count": template.regions.len(),
314        }))
315    }
316
317    async fn tool_apply_template(&self, args: &Value) -> Result<Value> {
318        let template_id = args
319            .get("template_id")
320            .and_then(Value::as_str)
321            .and_then(|s| Uuid::parse_str(s).ok())
322            .ok_or_else(|| {
323                PluginError::TemplateValidationError("invalid 'template_id'".to_string())
324            })?;
325
326        let html = args
327            .get("html")
328            .and_then(Value::as_str)
329            .map(ToString::to_string)
330            .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
331
332        let url = args
333            .get("url")
334            .and_then(Value::as_str)
335            .map(ToString::to_string)
336            .ok_or_else(|| PluginError::TemplateValidationError("missing 'url'".to_string()))?;
337        let debug = args.get("debug").and_then(Value::as_bool).unwrap_or(false);
338
339        let template = self.template_store.get(&template_id).await?;
340        let request = ExtractionRequest::new(template, &url, &html);
341        let result = self.extraction_engine.execute(&request).await?;
342        let debug_payload = debug.then(|| ExtractionEngine::diagnose(&request, "document"));
343
344        Ok(json!({
345            "data": result.data,
346            "metadata": {
347                "regions_successful": result.metadata.region_status.values().filter(|s| s.success).count(),
348                "total_regions": result.metadata.region_status.len(),
349                "elapsed_ms": result.metadata.elapsed_ms,
350                "region_status": result.metadata.region_status,
351                "errors": result.metadata.errors,
352            },
353            "debug": debug_payload,
354        }))
355    }
356
357    async fn tool_list_templates(&self, _args: &Value) -> Result<Value> {
358        let templates = self.template_store.list().await?;
359        let list: Vec<_> = templates
360            .iter()
361            .map(|t| {
362                json!({
363                    "id": t.id.to_string(),
364                    "name": &t.name,
365                    "description": &t.description,
366                    "regions": t.regions.len(),
367                    "created_at": t.metadata.created_at.to_rfc3339(),
368                    "usage_count": t.metadata.usage_count,
369                    "tags": &t.metadata.tags,
370                })
371            })
372            .collect();
373
374        Ok(json!({
375            "count": list.len(),
376            "templates": list,
377        }))
378    }
379
380    async fn tool_delete_template(&self, args: &Value) -> Result<Value> {
381        let template_id = args
382            .get("template_id")
383            .and_then(Value::as_str)
384            .and_then(|s| Uuid::parse_str(s).ok())
385            .ok_or_else(|| {
386                PluginError::TemplateValidationError("invalid 'template_id'".to_string())
387            })?;
388
389        self.template_store.delete(&template_id).await?;
390
391        Ok(json!({
392            "deleted": template_id.to_string(),
393        }))
394    }
395
396    async fn tool_get_template(&self, args: &Value) -> Result<Value> {
397        let template_id = args
398            .get("template_id")
399            .and_then(Value::as_str)
400            .and_then(|s| Uuid::parse_str(s).ok())
401            .ok_or_else(|| {
402                PluginError::TemplateValidationError("invalid 'template_id'".to_string())
403            })?;
404
405        let template = self.template_store.get(&template_id).await?;
406
407        Ok(json!({
408            "id": template.id.to_string(),
409            "name": template.name,
410            "description": template.description,
411            "regions": template.regions.iter().map(|r| {
412                json!({
413                    "name": r.name,
414                    "selector": format!("{:?}", r.selector),
415                    "transformations": r.transformations.iter().map(|t| format!("{t:?}")).collect::<Vec<_>>(),
416                })
417            }).collect::<Vec<_>>(),
418            "metadata": {
419                "created_at": template.metadata.created_at.to_rfc3339(),
420                "updated_at": template.metadata.updated_at.to_rfc3339(),
421                "usage_count": template.metadata.usage_count,
422            }
423        }))
424    }
425
426    async fn tool_extract_batch(&self, args: &Value) -> Result<Value> {
427        let template_id = args
428            .get("template_id")
429            .and_then(Value::as_str)
430            .and_then(|s| Uuid::parse_str(s).ok())
431            .ok_or_else(|| {
432                PluginError::TemplateValidationError("invalid 'template_id'".to_string())
433            })?;
434
435        let html = args
436            .get("html")
437            .and_then(Value::as_str)
438            .map(ToString::to_string)
439            .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
440
441        let url = args
442            .get("url")
443            .and_then(Value::as_str)
444            .map(ToString::to_string)
445            .ok_or_else(|| PluginError::TemplateValidationError("missing 'url'".to_string()))?;
446
447        let root_selector_str = args
448            .get("root_selector")
449            .and_then(Value::as_str)
450            .map(ToString::to_string)
451            .ok_or_else(|| {
452                PluginError::TemplateValidationError("missing 'root_selector'".to_string())
453            })?;
454        let debug = args.get("debug").and_then(Value::as_bool).unwrap_or(false);
455
456        // Parse root selector as CSS (XPath not supported for batch extraction)
457        let root_selector =
458            ScraperSelector::parse(&root_selector_str).map_err(|_| PluginError::SelectorError {
459                selector: root_selector_str.clone(),
460                reason: "Failed to parse root_selector as CSS selector".to_string(),
461            })?;
462
463        // Parse HTML and find all root containers.
464        // Keep this in a separate scope so non-Send scraper internals are dropped before await.
465        let root_elements: Vec<String> = {
466            let document = Html::parse_document(&html);
467            document
468                .select(&root_selector)
469                .map(|elem| elem.html())
470                .collect()
471        };
472
473        if root_elements.is_empty() {
474            return Err(PluginError::ExtractionError(format!(
475                "root_selector matched no elements: {root_selector_str}"
476            )));
477        }
478
479        let first_root_html = debug.then(|| {
480            root_elements.first().map(|root| {
481                let mut truncated = String::new();
482                for (index, ch) in root.chars().enumerate() {
483                    if index >= 2_000 {
484                        truncated.push_str("...");
485                        break;
486                    }
487                    truncated.push(ch);
488                }
489                truncated
490            })
491        });
492
493        // Extract data from each root container
494        let template = self.template_store.get(&template_id).await?;
495        let mut results = Vec::new();
496
497        for root_html in root_elements {
498            let request = ExtractionRequest::new(template.clone(), &url, &root_html);
499            match self.extraction_engine.execute(&request).await {
500                Ok(result) => {
501                    results.push(json!({
502                        "data": result.data,
503                        "successful_regions": result.metadata.region_status.values().filter(|s| s.success).count(),
504                    }));
505                }
506                Err(e) => {
507                    // Continue with partial results on error
508                    results.push(json!({
509                        "error": e.to_string(),
510                        "successful_regions": 0,
511                    }));
512                }
513            }
514        }
515
516        Ok(json!({
517            "root_selector": root_selector_str,
518            "results": results,
519            "total_matched": results.len(),
520            "successful": results.iter().filter(|r| r.get("data").is_some()).count(),
521            "debug": debug.then(|| json!({
522                "evaluation_scope": "root_fragment",
523                "first_root_html": first_root_html,
524            })),
525        }))
526    }
527
528    async fn tool_inspect_selector(&self, args: &Value) -> Result<Value> {
529        let html = args
530            .get("html")
531            .and_then(Value::as_str)
532            .map(ToString::to_string)
533            .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
534
535        let selector_css = args
536            .get("selector_css")
537            .and_then(Value::as_str)
538            .map(ToString::to_string);
539        let selector_xpath = args
540            .get("selector_xpath")
541            .and_then(Value::as_str)
542            .map(ToString::to_string);
543
544        let selector = match (&selector_css, &selector_xpath) {
545            (Some(css), Some(xpath)) => Selector::dual(css, xpath),
546            (Some(css), None) => Selector::css(css),
547            (None, Some(xpath)) => Selector::xpath(xpath),
548            (None, None) => {
549                return Err(PluginError::TemplateValidationError(
550                    "must provide either selector_css or selector_xpath".to_string(),
551                ));
552            }
553        };
554
555        selector.validate()?;
556
557        // Use the CSS selector for validation/counting since XPath is not yet supported
558        if let Some(css) = selector_css {
559            let (is_valid, count) = self
560                .extraction_engine
561                .validate_selector(&html, &css)
562                .await?;
563            Ok(json!({
564                "selector": css,
565                "selector_type": "css",
566                "valid": is_valid,
567                "match_count": count,
568                "preview": if count > 0 { "Selector matched elements" } else { "No elements matched" }
569            }))
570        } else if selector_xpath.is_some() {
571            // XPath validation not yet implemented
572            Ok(json!({
573                "selector": selector_xpath,
574                "selector_type": "xpath",
575                "valid": true,
576                "note": "XPath selectors are not yet supported for validation. Please use CSS selectors to test matches."
577            }))
578        } else {
579            Err(PluginError::TemplateValidationError(
580                "No selector provided".to_string(),
581            ))
582        }
583    }
584}
585
586// ─── Helpers ───────────────────────────────────────────────────────────────
587
588pub(crate) fn parse_transformation(s: &str) -> Result<Transformation> {
589    match s {
590        "Trim" => Ok(Transformation::Trim),
591        "Lowercase" => Ok(Transformation::Lowercase),
592        "Uppercase" => Ok(Transformation::Uppercase),
593        "RemoveWhitespace" => Ok(Transformation::RemoveWhitespace),
594        "NormalizeWhitespace" => Ok(Transformation::NormalizeWhitespace),
595        "StripHtml" => Ok(Transformation::StripHtml),
596        "DecodeHtml" => Ok(Transformation::DecodeHtml),
597        "ParseJson" => Ok(Transformation::ParseJson),
598        s if s.starts_with("RegexExtract:") => s
599            .strip_prefix("RegexExtract:")
600            .and_then(|rest| rest.rsplit_once('/'))
601            .map_or_else(
602                || {
603                    Err(PluginError::TemplateValidationError(
604                        "RegexExtract format: RegexExtract:pattern/group".to_string(),
605                    ))
606                },
607                |(pattern, group_str)| {
608                    let group = group_str.parse::<usize>().map_err(|_| {
609                        PluginError::TemplateValidationError(
610                            "RegexExtract group must be a positive integer".to_string(),
611                        )
612                    })?;
613                    Ok(Transformation::RegexExtract {
614                        pattern: pattern.to_string(),
615                        group,
616                    })
617                },
618            ),
619        s if s.starts_with("Coerce:") => s.strip_prefix("Coerce:").map_or_else(
620            || {
621                Err(PluginError::TemplateValidationError(
622                    "Coerce format: Coerce:type".to_string(),
623                ))
624            },
625            |target_type| {
626                Ok(Transformation::Coerce {
627                    target_type: target_type.to_string(),
628                })
629            },
630        ),
631        s if s.starts_with("Filter:") => s.strip_prefix("Filter:").map_or_else(
632            || {
633                Err(PluginError::TemplateValidationError(
634                    "Filter format: Filter:pattern".to_string(),
635                ))
636            },
637            |pattern| {
638                Ok(Transformation::Filter {
639                    pattern: pattern.to_string(),
640                })
641            },
642        ),
643        s if s.starts_with("Regex:") => s
644            .strip_prefix("Regex:")
645            .and_then(|rest| rest.split_once('/'))
646            .map_or_else(
647                || {
648                    Err(PluginError::TemplateValidationError(
649                        "Regex format: Regex:pattern/replacement".to_string(),
650                    ))
651                },
652                |(pattern, replacement)| {
653                    Ok(Transformation::Regex {
654                        pattern: pattern.to_string(),
655                        replacement: replacement.to_string(),
656                    })
657                },
658            ),
659        _ => Err(PluginError::TemplateValidationError(format!(
660            "unknown transformation: {s}"
661        ))),
662    }
663}
664
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: every supported family parses and an unknown name does not.
    #[test]
    fn test_parse_transformation() {
        assert!(parse_transformation("Trim").is_ok());
        assert!(parse_transformation("Lowercase").is_ok());
        assert!(parse_transformation("Regex:pattern/replace").is_ok());
        assert!(parse_transformation("RegexExtract:price:(\\d+\\.\\d+)/1").is_ok());
        assert!(parse_transformation("Coerce:number").is_ok());
        assert!(parse_transformation("Filter:^ok$").is_ok());
        assert!(parse_transformation("Invalid").is_err());
    }

    /// Each bare name maps to its own variant, not merely to "some" success.
    #[test]
    fn parses_simple_names_to_matching_variants() {
        assert!(matches!(parse_transformation("Trim"), Ok(Transformation::Trim)));
        assert!(matches!(parse_transformation("Lowercase"), Ok(Transformation::Lowercase)));
        assert!(matches!(parse_transformation("Uppercase"), Ok(Transformation::Uppercase)));
        assert!(matches!(
            parse_transformation("RemoveWhitespace"),
            Ok(Transformation::RemoveWhitespace)
        ));
        assert!(matches!(
            parse_transformation("NormalizeWhitespace"),
            Ok(Transformation::NormalizeWhitespace)
        ));
        assert!(matches!(parse_transformation("StripHtml"), Ok(Transformation::StripHtml)));
        assert!(matches!(parse_transformation("DecodeHtml"), Ok(Transformation::DecodeHtml)));
        assert!(matches!(parse_transformation("ParseJson"), Ok(Transformation::ParseJson)));
    }

    /// `Regex:` splits at the first `/`, so the replacement keeps later slashes.
    #[test]
    fn regex_splits_on_first_slash() {
        match parse_transformation("Regex:a/b/c") {
            Ok(Transformation::Regex { pattern, replacement }) => {
                assert_eq!(pattern, "a");
                assert_eq!(replacement, "b/c");
            }
            _ => panic!("expected Transformation::Regex"),
        }
    }

    /// `RegexExtract:` splits at the last `/`, so the pattern keeps earlier slashes.
    #[test]
    fn regex_extract_splits_on_last_slash() {
        match parse_transformation("RegexExtract:a/b/2") {
            Ok(Transformation::RegexExtract { pattern, group }) => {
                assert_eq!(pattern, "a/b");
                assert_eq!(group, 2);
            }
            _ => panic!("expected Transformation::RegexExtract"),
        }
    }

    /// Group index 0 is accepted.
    #[test]
    fn regex_extract_accepts_group_zero() {
        assert!(matches!(
            parse_transformation("RegexExtract:\\d+/0"),
            Ok(Transformation::RegexExtract { group: 0, .. })
        ));
    }

    /// Payloads of `Coerce:` and `Filter:` are carried through verbatim.
    #[test]
    fn coerce_and_filter_keep_their_payload() {
        match parse_transformation("Coerce:number") {
            Ok(Transformation::Coerce { target_type }) => assert_eq!(target_type, "number"),
            _ => panic!("expected Transformation::Coerce"),
        }
        match parse_transformation("Filter:^ok$") {
            Ok(Transformation::Filter { pattern }) => assert_eq!(pattern, "^ok$"),
            _ => panic!("expected Transformation::Filter"),
        }
    }

    /// Malformed parameterised forms and wrong-case names are rejected.
    #[test]
    fn rejects_malformed_input() {
        assert!(parse_transformation("").is_err());
        assert!(parse_transformation("trim").is_err());
        assert!(parse_transformation("Regex:no-separator").is_err());
        assert!(parse_transformation("RegexExtract:no-separator").is_err());
        assert!(parse_transformation("RegexExtract:pattern/not-a-number").is_err());
        assert!(parse_transformation("RegexExtract:pattern/-1").is_err());
    }
}