Skip to main content

stygian_plugin/mcp/
server.rs

1//! MCP plugin server implementation
2//!
3//! Provides the core tool definitions and request handling for plugin extraction.
4
5use scraper::{Html, Selector as ScraperSelector};
6use serde_json::{Value, json};
7use std::sync::Arc;
8use uuid::Uuid;
9
10use crate::{
11    ExtractionRequest, PluginError, Result,
12    adapters::ExtractionEngine,
13    domain::{ExtractionTemplate, Region, Selector, Transformation},
14    ports::{IdempotencyKeyStore, PluginExtractionPort, PluginTemplateStore},
15    storage::{FileTemplateStore, MemoryIdempotencyStore},
16};
17
/// Human-readable list of the transformation specs understood by `parse_transformation`.
///
/// Embedded verbatim in validation error messages so callers can see what is accepted.
const SUPPORTED_TRANSFORMATIONS: &str = concat!(
    "Trim, Lowercase, Uppercase, RemoveWhitespace, NormalizeWhitespace, ",
    "StripHtml, DecodeHtml, ParseJson, ",
    "Regex:pattern/replacement, RegexExtract:pattern/group, ",
    "Coerce:type, Filter:pattern",
);
19
20/// MCP server providing plugin extraction tools
21#[allow(dead_code)]
22pub struct McpPluginServer {
23    template_store: Arc<dyn PluginTemplateStore>,
24    extraction_engine: Arc<dyn PluginExtractionPort>,
25    idempotency_store: Arc<dyn IdempotencyKeyStore>,
26}
27
28impl McpPluginServer {
29    /// Create a new plugin MCP server with file-based storage (development)
30    pub fn new_with_file_storage(templates_dir: std::path::PathBuf) -> Self {
31        Self {
32            template_store: Arc::new(FileTemplateStore::new(templates_dir)),
33            extraction_engine: Arc::new(ExtractionEngine),
34            idempotency_store: Arc::new(MemoryIdempotencyStore::new()),
35        }
36    }
37
38    /// Create with custom adapters
39    pub fn with_adapters(
40        template_store: Arc<dyn PluginTemplateStore>,
41        extraction_engine: Arc<dyn PluginExtractionPort>,
42        idempotency_store: Arc<dyn IdempotencyKeyStore>,
43    ) -> Self {
44        Self {
45            template_store,
46            extraction_engine,
47            idempotency_store,
48        }
49    }
50
51    fn tools_template_management() -> [Value; 3] {
52        [
53            json!({
54                "name": "plugin_create_template",
55                "description": "Create a new extraction template with the given name and optional description. Returns the template UUID.",
56                "inputSchema": {
57                    "type": "object",
58                    "properties": {
59                        "name": { "type": "string", "description": "Template name (e.g., 'Product Listings')" },
60                        "description": { "type": "string", "description": "Optional template description" },
61                        "tags": {
62                            "type": "array",
63                            "items": { "type": "string" },
64                            "description": "Optional tags for organization"
65                        }
66                    },
67                    "required": ["name"]
68                }
69            }),
70            json!({
71                "name": "plugin_list_templates",
72                "description": "List all saved extraction templates with metadata.",
73                "inputSchema": { "type": "object", "properties": {} }
74            }),
75            json!({
76                "name": "plugin_delete_template",
77                "description": "Delete an extraction template permanently.",
78                "inputSchema": {
79                    "type": "object",
80                    "properties": {
81                        "template_id": { "type": "string", "description": "UUID of the template to delete" }
82                    },
83                    "required": ["template_id"]
84                }
85            }),
86        ]
87    }
88
89    fn tools_extraction() -> [Value; 4] {
90        [
91            json!({
92                "name": "plugin_add_region",
93                "description": "Add an extraction region (named zone) to a template. A region is a named selector with transformations.",
94                "inputSchema": {
95                    "type": "object",
96                    "properties": {
97                        "template_id": { "type": "string", "description": "UUID of the template" },
98                        "region_name": { "type": "string", "description": "Unique name for this region (e.g., 'product_title')" },
99                        "selector_css": { "type": "string", "description": "Optional CSS selector" },
100                        "selector_xpath": { "type": "string", "description": "Optional XPath selector" },
101                        "transformations": {
102                            "type": "array",
103                            "items": { "type": "string" },
104                            "description": "Ordered transformations: 'Trim', 'Lowercase', 'Regex:pattern/replace', 'StripHtml', etc."
105                        }
106                    },
107                    "required": ["template_id", "region_name"]
108                }
109            }),
110            json!({
111                "name": "plugin_apply_template",
112                "description": "Apply an extraction template to HTML content. Returns extracted data for each region.",
113                "inputSchema": {
114                    "type": "object",
115                    "properties": {
116                        "template_id": { "type": "string", "description": "UUID of the template to apply" },
117                        "html": { "type": "string", "description": "HTML content to extract from" },
118                        "url": { "type": "string", "description": "Source URL (for logging/context)" },
119                        "debug": { "type": "boolean", "description": "Include per-region selector diagnostics and root HTML snippet." }
120                    },
121                    "required": ["template_id", "html", "url"]
122                }
123            }),
124            json!({
125                "name": "plugin_get_template",
126                "description": "Retrieve a template's full configuration.",
127                "inputSchema": {
128                    "type": "object",
129                    "properties": {
130                        "template_id": { "type": "string", "description": "UUID of the template" }
131                    },
132                    "required": ["template_id"]
133                }
134            }),
135            json!({
136                "name": "plugin_extract_batch",
137                "description": "Apply a template to extract multiple instances from a page (e.g., all products).",
138                "inputSchema": {
139                    "type": "object",
140                    "properties": {
141                        "template_id": { "type": "string", "description": "UUID of the template" },
142                        "html": { "type": "string", "description": "HTML content" },
143                        "url": { "type": "string", "description": "Source URL" },
144                        "root_selector": { "type": "string", "description": "CSS selector for parent containers to iterate over" }
145                    },
146                    "required": ["template_id", "html", "url", "root_selector"]
147                }
148            }),
149        ]
150    }
151
152    fn tools_inspection() -> [Value; 1] {
153        [json!({
154            "name": "plugin_inspect_selector",
155            "description": "Test if a CSS/XPath selector matches elements in HTML. Returns match count and preview.",
156            "inputSchema": {
157                "type": "object",
158                "properties": {
159                    "html": { "type": "string", "description": "HTML to test against" },
160                    "selector_css": { "type": "string", "description": "CSS selector to test" },
161                    "selector_xpath": { "type": "string", "description": "XPath to test as fallback" }
162                },
163                "required": ["html"]
164            }
165        })]
166    }
167
168    /// Get the tool list for MCP protocol
169    pub fn tools_list(&self) -> Vec<Value> {
170        let mut tools = Vec::with_capacity(8);
171        tools.extend(Self::tools_template_management());
172        tools.extend(Self::tools_extraction());
173        tools.extend(Self::tools_inspection());
174        tools
175    }
176
177    /// Handle a tool call
178    pub async fn handle_tool_call(&self, name: &str, args: &Value) -> Value {
179        let result = match name {
180            "plugin_create_template" => self.tool_create_template(args).await,
181            "plugin_add_region" => self.tool_add_region(args).await,
182            "plugin_apply_template" => self.tool_apply_template(args).await,
183            "plugin_list_templates" => self.tool_list_templates(args).await,
184            "plugin_delete_template" => self.tool_delete_template(args).await,
185            "plugin_get_template" => self.tool_get_template(args).await,
186            "plugin_extract_batch" => self.tool_extract_batch(args).await,
187            "plugin_inspect_selector" => self.tool_inspect_selector(args).await,
188            _ => Err(PluginError::TemplateValidationError(format!(
189                "unknown tool: {name}"
190            ))),
191        };
192
193        match result {
194            Ok(data) => {
195                json!({ "content": [{ "type": "text", "text": serde_json::to_string(&data).unwrap_or_default() }] })
196            }
197            Err(e) => {
198                json!({ "content": [{ "type": "text", "text": format!("Error: {}", e) }], "isError": true })
199            }
200        }
201    }
202
203    // ── Tool implementations ───────────────────────────────────────────────
204
205    async fn tool_create_template(&self, args: &Value) -> Result<Value> {
206        let name = args
207            .get("name")
208            .and_then(Value::as_str)
209            .ok_or_else(|| PluginError::TemplateValidationError("missing 'name'".to_string()))?;
210
211        let description = args
212            .get("description")
213            .and_then(Value::as_str)
214            .map(ToString::to_string);
215
216        let tags = args
217            .get("tags")
218            .and_then(Value::as_array)
219            .map(|a| {
220                a.iter()
221                    .filter_map(|v| v.as_str().map(ToString::to_string))
222                    .collect()
223            })
224            .unwrap_or_default();
225
226        let mut template = ExtractionTemplate::new(name);
227        if let Some(desc) = description {
228            template = template.with_description(desc);
229        }
230        template = template.with_tags(tags);
231
232        self.template_store.save(&template).await?;
233
234        Ok(json!({
235            "template_id": template.id.to_string(),
236            "name": template.name,
237            "created_at": template.metadata.created_at.to_rfc3339(),
238        }))
239    }
240
241    async fn tool_add_region(&self, args: &Value) -> Result<Value> {
242        let template_id = args
243            .get("template_id")
244            .and_then(Value::as_str)
245            .and_then(|s| Uuid::parse_str(s).ok())
246            .ok_or_else(|| {
247                PluginError::TemplateValidationError("invalid 'template_id'".to_string())
248            })?;
249
250        let region_name = args
251            .get("region_name")
252            .and_then(Value::as_str)
253            .map(ToString::to_string)
254            .ok_or_else(|| {
255                PluginError::TemplateValidationError("missing 'region_name'".to_string())
256            })?;
257
258        let selector_css = args
259            .get("selector_css")
260            .and_then(Value::as_str)
261            .map(ToString::to_string);
262        let selector_xpath = args
263            .get("selector_xpath")
264            .and_then(Value::as_str)
265            .map(ToString::to_string);
266
267        let selector = match (selector_css, selector_xpath) {
268            (Some(css), Some(xpath)) => Selector::dual(css, xpath),
269            (Some(css), None) => Selector::css(css),
270            (None, Some(xpath)) => Selector::xpath(xpath),
271            (None, None) => {
272                return Err(PluginError::TemplateValidationError(
273                    "must provide either selector_css or selector_xpath".to_string(),
274                ));
275            }
276        };
277
278        // Load template
279        let mut template = self.template_store.get(&template_id).await?;
280
281        // Parse transformations - validate all entries and fail on first error
282        let mut transformations = Vec::new();
283        if let Some(arr) = args.get("transformations").and_then(Value::as_array) {
284            for (idx, v) in arr.iter().enumerate() {
285                let s = v.as_str().ok_or_else(|| {
286                    PluginError::TemplateValidationError(format!(
287                        "transformation at index {idx} must be a string"
288                    ))
289                })?;
290                let transformation = parse_transformation(s).map_err(|_| {
291                    PluginError::TemplateValidationError(format!(
292                        "invalid transformation at index {idx}: '{s}'. Supported transformations: {SUPPORTED_TRANSFORMATIONS}"
293                    ))
294                })?;
295                transformations.push(transformation);
296            }
297        }
298
299        // Create region
300        let mut region = Region::new(&region_name, selector, json!({"type": "string"}));
301        for t in transformations {
302            region = region.with_transformation(t);
303        }
304
305        template = template.with_region(region);
306        self.template_store.save(&template).await?;
307
308        Ok(json!({
309            "template_id": template.id.to_string(),
310            "region_name": region_name,
311            "regions_count": template.regions.len(),
312        }))
313    }
314
315    async fn tool_apply_template(&self, args: &Value) -> Result<Value> {
316        let template_id = args
317            .get("template_id")
318            .and_then(Value::as_str)
319            .and_then(|s| Uuid::parse_str(s).ok())
320            .ok_or_else(|| {
321                PluginError::TemplateValidationError("invalid 'template_id'".to_string())
322            })?;
323
324        let html = args
325            .get("html")
326            .and_then(Value::as_str)
327            .map(ToString::to_string)
328            .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
329
330        let url = args
331            .get("url")
332            .and_then(Value::as_str)
333            .map(ToString::to_string)
334            .ok_or_else(|| PluginError::TemplateValidationError("missing 'url'".to_string()))?;
335        let debug = args.get("debug").and_then(Value::as_bool).unwrap_or(false);
336
337        let template = self.template_store.get(&template_id).await?;
338        let request = ExtractionRequest::new(template, &url, &html);
339        let result = self.extraction_engine.execute(&request).await?;
340        let debug_payload = debug.then(|| ExtractionEngine::diagnose(&request, "document"));
341
342        Ok(json!({
343            "data": result.data,
344            "metadata": {
345                "regions_successful": result.metadata.region_status.values().filter(|s| s.success).count(),
346                "total_regions": result.metadata.region_status.len(),
347                "elapsed_ms": result.metadata.elapsed_ms,
348                "region_status": result.metadata.region_status,
349                "errors": result.metadata.errors,
350            },
351            "debug": debug_payload,
352        }))
353    }
354
355    async fn tool_list_templates(&self, _args: &Value) -> Result<Value> {
356        let templates = self.template_store.list().await?;
357        let list: Vec<_> = templates
358            .iter()
359            .map(|t| {
360                json!({
361                    "id": t.id.to_string(),
362                    "name": &t.name,
363                    "description": &t.description,
364                    "regions": t.regions.len(),
365                    "created_at": t.metadata.created_at.to_rfc3339(),
366                    "usage_count": t.metadata.usage_count,
367                    "tags": &t.metadata.tags,
368                })
369            })
370            .collect();
371
372        Ok(json!({
373            "count": list.len(),
374            "templates": list,
375        }))
376    }
377
378    async fn tool_delete_template(&self, args: &Value) -> Result<Value> {
379        let template_id = args
380            .get("template_id")
381            .and_then(Value::as_str)
382            .and_then(|s| Uuid::parse_str(s).ok())
383            .ok_or_else(|| {
384                PluginError::TemplateValidationError("invalid 'template_id'".to_string())
385            })?;
386
387        self.template_store.delete(&template_id).await?;
388
389        Ok(json!({
390            "deleted": template_id.to_string(),
391        }))
392    }
393
394    async fn tool_get_template(&self, args: &Value) -> Result<Value> {
395        let template_id = args
396            .get("template_id")
397            .and_then(Value::as_str)
398            .and_then(|s| Uuid::parse_str(s).ok())
399            .ok_or_else(|| {
400                PluginError::TemplateValidationError("invalid 'template_id'".to_string())
401            })?;
402
403        let template = self.template_store.get(&template_id).await?;
404
405        Ok(json!({
406            "id": template.id.to_string(),
407            "name": template.name,
408            "description": template.description,
409            "regions": template.regions.iter().map(|r| {
410                json!({
411                    "name": r.name,
412                    "selector": format!("{:?}", r.selector),
413                    "transformations": r.transformations.iter().map(|t| format!("{t:?}")).collect::<Vec<_>>(),
414                })
415            }).collect::<Vec<_>>(),
416            "metadata": {
417                "created_at": template.metadata.created_at.to_rfc3339(),
418                "updated_at": template.metadata.updated_at.to_rfc3339(),
419                "usage_count": template.metadata.usage_count,
420            }
421        }))
422    }
423
424    async fn tool_extract_batch(&self, args: &Value) -> Result<Value> {
425        let template_id = args
426            .get("template_id")
427            .and_then(Value::as_str)
428            .and_then(|s| Uuid::parse_str(s).ok())
429            .ok_or_else(|| {
430                PluginError::TemplateValidationError("invalid 'template_id'".to_string())
431            })?;
432
433        let html = args
434            .get("html")
435            .and_then(Value::as_str)
436            .map(ToString::to_string)
437            .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
438
439        let url = args
440            .get("url")
441            .and_then(Value::as_str)
442            .map(ToString::to_string)
443            .ok_or_else(|| PluginError::TemplateValidationError("missing 'url'".to_string()))?;
444
445        let root_selector_str = args
446            .get("root_selector")
447            .and_then(Value::as_str)
448            .map(ToString::to_string)
449            .ok_or_else(|| {
450                PluginError::TemplateValidationError("missing 'root_selector'".to_string())
451            })?;
452        let debug = args.get("debug").and_then(Value::as_bool).unwrap_or(false);
453
454        // Parse root selector as CSS (XPath not supported for batch extraction)
455        let root_selector =
456            ScraperSelector::parse(&root_selector_str).map_err(|_| PluginError::SelectorError {
457                selector: root_selector_str.clone(),
458                reason: "Failed to parse root_selector as CSS selector".to_string(),
459            })?;
460
461        // Parse HTML and find all root containers.
462        // Keep this in a separate scope so non-Send scraper internals are dropped before await.
463        let root_elements: Vec<String> = {
464            let document = Html::parse_document(&html);
465            document
466                .select(&root_selector)
467                .map(|elem| elem.html())
468                .collect()
469        };
470
471        if root_elements.is_empty() {
472            return Err(PluginError::ExtractionError(format!(
473                "root_selector matched no elements: {root_selector_str}"
474            )));
475        }
476
477        let first_root_html = debug.then(|| {
478            root_elements.first().map(|root| {
479                let mut truncated = String::new();
480                for (index, ch) in root.chars().enumerate() {
481                    if index >= 2_000 {
482                        truncated.push_str("...");
483                        break;
484                    }
485                    truncated.push(ch);
486                }
487                truncated
488            })
489        });
490
491        // Extract data from each root container
492        let template = self.template_store.get(&template_id).await?;
493        let mut results = Vec::new();
494
495        for root_html in root_elements {
496            let request = ExtractionRequest::new(template.clone(), &url, &root_html);
497            match self.extraction_engine.execute(&request).await {
498                Ok(result) => {
499                    results.push(json!({
500                        "data": result.data,
501                        "successful_regions": result.metadata.region_status.values().filter(|s| s.success).count(),
502                    }));
503                }
504                Err(e) => {
505                    // Continue with partial results on error
506                    results.push(json!({
507                        "error": e.to_string(),
508                        "successful_regions": 0,
509                    }));
510                }
511            }
512        }
513
514        Ok(json!({
515            "root_selector": root_selector_str,
516            "results": results,
517            "total_matched": results.len(),
518            "successful": results.iter().filter(|r| r.get("data").is_some()).count(),
519            "debug": debug.then(|| json!({
520                "evaluation_scope": "root_fragment",
521                "first_root_html": first_root_html,
522            })),
523        }))
524    }
525
526    async fn tool_inspect_selector(&self, args: &Value) -> Result<Value> {
527        let html = args
528            .get("html")
529            .and_then(Value::as_str)
530            .map(ToString::to_string)
531            .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
532
533        let selector_css = args
534            .get("selector_css")
535            .and_then(Value::as_str)
536            .map(ToString::to_string);
537        let selector_xpath = args
538            .get("selector_xpath")
539            .and_then(Value::as_str)
540            .map(ToString::to_string);
541
542        let selector = match (&selector_css, &selector_xpath) {
543            (Some(css), Some(xpath)) => Selector::dual(css, xpath),
544            (Some(css), None) => Selector::css(css),
545            (None, Some(xpath)) => Selector::xpath(xpath),
546            (None, None) => {
547                return Err(PluginError::TemplateValidationError(
548                    "must provide either selector_css or selector_xpath".to_string(),
549                ));
550            }
551        };
552
553        selector.validate()?;
554
555        // Use the CSS selector for validation/counting since XPath is not yet supported
556        if let Some(css) = selector_css {
557            let (is_valid, count) = self
558                .extraction_engine
559                .validate_selector(&html, &css)
560                .await?;
561            Ok(json!({
562                "selector": css,
563                "selector_type": "css",
564                "valid": is_valid,
565                "match_count": count,
566                "preview": if count > 0 { "Selector matched elements" } else { "No elements matched" }
567            }))
568        } else if selector_xpath.is_some() {
569            // XPath validation not yet implemented
570            Ok(json!({
571                "selector": selector_xpath,
572                "selector_type": "xpath",
573                "valid": true,
574                "note": "XPath selectors are not yet supported for validation. Please use CSS selectors to test matches."
575            }))
576        } else {
577            Err(PluginError::TemplateValidationError(
578                "No selector provided".to_string(),
579            ))
580        }
581    }
582}
583
584// ─── Helpers ───────────────────────────────────────────────────────────────
585
586pub(crate) fn parse_transformation(s: &str) -> Result<Transformation> {
587    match s {
588        "Trim" => Ok(Transformation::Trim),
589        "Lowercase" => Ok(Transformation::Lowercase),
590        "Uppercase" => Ok(Transformation::Uppercase),
591        "RemoveWhitespace" => Ok(Transformation::RemoveWhitespace),
592        "NormalizeWhitespace" => Ok(Transformation::NormalizeWhitespace),
593        "StripHtml" => Ok(Transformation::StripHtml),
594        "DecodeHtml" => Ok(Transformation::DecodeHtml),
595        "ParseJson" => Ok(Transformation::ParseJson),
596        s if s.starts_with("RegexExtract:") => s
597            .strip_prefix("RegexExtract:")
598            .and_then(|rest| rest.rsplit_once('/'))
599            .map_or_else(
600                || {
601                    Err(PluginError::TemplateValidationError(
602                        "RegexExtract format: RegexExtract:pattern/group".to_string(),
603                    ))
604                },
605                |(pattern, group_str)| {
606                    let group = group_str.parse::<usize>().map_err(|_| {
607                        PluginError::TemplateValidationError(
608                            "RegexExtract group must be a positive integer".to_string(),
609                        )
610                    })?;
611                    Ok(Transformation::RegexExtract {
612                        pattern: pattern.to_string(),
613                        group,
614                    })
615                },
616            ),
617        s if s.starts_with("Coerce:") => s.strip_prefix("Coerce:").map_or_else(
618            || {
619                Err(PluginError::TemplateValidationError(
620                    "Coerce format: Coerce:type".to_string(),
621                ))
622            },
623            |target_type| {
624                Ok(Transformation::Coerce {
625                    target_type: target_type.to_string(),
626                })
627            },
628        ),
629        s if s.starts_with("Filter:") => s.strip_prefix("Filter:").map_or_else(
630            || {
631                Err(PluginError::TemplateValidationError(
632                    "Filter format: Filter:pattern".to_string(),
633                ))
634            },
635            |pattern| {
636                Ok(Transformation::Filter {
637                    pattern: pattern.to_string(),
638                })
639            },
640        ),
641        s if s.starts_with("Regex:") => s
642            .strip_prefix("Regex:")
643            .and_then(|rest| rest.split_once('/'))
644            .map_or_else(
645                || {
646                    Err(PluginError::TemplateValidationError(
647                        "Regex format: Regex:pattern/replacement".to_string(),
648                    ))
649                },
650                |(pattern, replacement)| {
651                    Ok(Transformation::Regex {
652                        pattern: pattern.to_string(),
653                        replacement: replacement.to_string(),
654                    })
655                },
656            ),
657        _ => Err(PluginError::TemplateValidationError(format!(
658            "unknown transformation: {s}"
659        ))),
660    }
661}
662
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: one representative spec per accepted form parses, and an unknown
    /// name is rejected.
    #[test]
    fn test_parse_transformation() {
        let accepted = [
            "Trim",
            "Lowercase",
            "Regex:pattern/replace",
            "RegexExtract:price:(\\d+\\.\\d+)/1",
            "Coerce:number",
            "Filter:^ok$",
        ];
        for spec in accepted {
            assert!(parse_transformation(spec).is_ok());
        }

        assert!(parse_transformation("Invalid").is_err());
    }
}
677}