Skip to main content

stygian_plugin/mcp/
server.rs

1//! MCP plugin server implementation
2//!
3//! Provides the core tool definitions and request handling for plugin extraction.
4
5use scraper::{Html, Selector as ScraperSelector};
6use serde_json::{Value, json};
7use std::sync::Arc;
8use uuid::Uuid;
9
10use crate::{
11    ExtractionRequest, PluginError, Result,
12    adapters::ExtractionEngine,
13    domain::{ExtractionTemplate, Region, Selector, Transformation},
14    ports::{IdempotencyKeyStore, PluginExtractionPort, PluginTemplateStore},
15    storage::{FileTemplateStore, MemoryIdempotencyStore},
16};
17
18const SUPPORTED_TRANSFORMATIONS: &str = "Trim, Lowercase, Uppercase, RemoveWhitespace, NormalizeWhitespace, StripHtml, DecodeHtml, ParseJson, Regex:pattern/replacement, RegexExtract:pattern/group, Coerce:type, Filter:pattern";
19
20/// MCP server providing plugin extraction tools
21#[allow(dead_code)]
22pub struct McpPluginServer {
23    template_store: Arc<dyn PluginTemplateStore>,
24    extraction_engine: Arc<dyn PluginExtractionPort>,
25    idempotency_store: Arc<dyn IdempotencyKeyStore>,
26}
27
28impl McpPluginServer {
29    /// Create a new plugin MCP server with file-based storage (development)
30    pub fn new_with_file_storage(templates_dir: std::path::PathBuf) -> Self {
31        Self {
32            template_store: Arc::new(FileTemplateStore::new(templates_dir)),
33            extraction_engine: Arc::new(ExtractionEngine),
34            idempotency_store: Arc::new(MemoryIdempotencyStore::new()),
35        }
36    }
37
38    /// Create with custom adapters
39    pub fn with_adapters(
40        template_store: Arc<dyn PluginTemplateStore>,
41        extraction_engine: Arc<dyn PluginExtractionPort>,
42        idempotency_store: Arc<dyn IdempotencyKeyStore>,
43    ) -> Self {
44        Self {
45            template_store,
46            extraction_engine,
47            idempotency_store,
48        }
49    }
50
51    fn tools_template_management() -> [Value; 3] {
52        [
53            json!({
54                "name": "plugin_create_template",
55                "description": "Create a new extraction template with the given name and optional description. Returns the template UUID.",
56                "inputSchema": {
57                    "type": "object",
58                    "properties": {
59                        "name": { "type": "string", "description": "Template name (e.g., 'Product Listings')" },
60                        "description": { "type": "string", "description": "Optional template description" },
61                        "tags": {
62                            "type": "array",
63                            "items": { "type": "string" },
64                            "description": "Optional tags for organization"
65                        }
66                    },
67                    "required": ["name"]
68                }
69            }),
70            json!({
71                "name": "plugin_list_templates",
72                "description": "List all saved extraction templates with metadata.",
73                "inputSchema": { "type": "object", "properties": {} }
74            }),
75            json!({
76                "name": "plugin_delete_template",
77                "description": "Delete an extraction template permanently.",
78                "inputSchema": {
79                    "type": "object",
80                    "properties": {
81                        "template_id": { "type": "string", "description": "UUID of the template to delete" }
82                    },
83                    "required": ["template_id"]
84                }
85            }),
86        ]
87    }
88
89    fn tools_extraction() -> [Value; 4] {
90        [
91            json!({
92                "name": "plugin_add_region",
93                "description": "Add an extraction region (named zone) to a template. A region is a named selector with transformations.",
94                "inputSchema": {
95                    "type": "object",
96                    "properties": {
97                        "template_id": { "type": "string", "description": "UUID of the template" },
98                        "region_name": { "type": "string", "description": "Unique name for this region (e.g., 'product_title')" },
99                        "selector_css": { "type": "string", "description": "Optional CSS selector" },
100                        "selector_xpath": { "type": "string", "description": "Optional XPath selector" },
101                        "transformations": {
102                            "type": "array",
103                            "items": { "type": "string" },
104                            "description": "Ordered transformations: 'Trim', 'Lowercase', 'Regex:pattern/replace', 'StripHtml', etc."
105                        }
106                    },
107                    "required": ["template_id", "region_name"]
108                }
109            }),
110            json!({
111                "name": "plugin_apply_template",
112                "description": "Apply an extraction template to HTML content. Returns extracted data for each region.",
113                "inputSchema": {
114                    "type": "object",
115                    "properties": {
116                        "template_id": { "type": "string", "description": "UUID of the template to apply" },
117                        "html": { "type": "string", "description": "HTML content to extract from" },
118                        "url": { "type": "string", "description": "Source URL (for logging/context)" }
119                    },
120                    "required": ["template_id", "html", "url"]
121                }
122            }),
123            json!({
124                "name": "plugin_get_template",
125                "description": "Retrieve a template's full configuration.",
126                "inputSchema": {
127                    "type": "object",
128                    "properties": {
129                        "template_id": { "type": "string", "description": "UUID of the template" }
130                    },
131                    "required": ["template_id"]
132                }
133            }),
134            json!({
135                "name": "plugin_extract_batch",
136                "description": "Apply a template to extract multiple instances from a page (e.g., all products).",
137                "inputSchema": {
138                    "type": "object",
139                    "properties": {
140                        "template_id": { "type": "string", "description": "UUID of the template" },
141                        "html": { "type": "string", "description": "HTML content" },
142                        "url": { "type": "string", "description": "Source URL" },
143                        "root_selector": { "type": "string", "description": "CSS selector for parent containers to iterate over" }
144                    },
145                    "required": ["template_id", "html", "url", "root_selector"]
146                }
147            }),
148        ]
149    }
150
151    fn tools_inspection() -> [Value; 1] {
152        [json!({
153            "name": "plugin_inspect_selector",
154            "description": "Test if a CSS/XPath selector matches elements in HTML. Returns match count and preview.",
155            "inputSchema": {
156                "type": "object",
157                "properties": {
158                    "html": { "type": "string", "description": "HTML to test against" },
159                    "selector_css": { "type": "string", "description": "CSS selector to test" },
160                    "selector_xpath": { "type": "string", "description": "XPath to test as fallback" }
161                },
162                "required": ["html"]
163            }
164        })]
165    }
166
167    /// Get the tool list for MCP protocol
168    pub fn tools_list(&self) -> Vec<Value> {
169        let mut tools = Vec::with_capacity(8);
170        tools.extend(Self::tools_template_management());
171        tools.extend(Self::tools_extraction());
172        tools.extend(Self::tools_inspection());
173        tools
174    }
175
176    /// Handle a tool call
177    pub async fn handle_tool_call(&self, name: &str, args: &Value) -> Value {
178        let result = match name {
179            "plugin_create_template" => self.tool_create_template(args).await,
180            "plugin_add_region" => self.tool_add_region(args).await,
181            "plugin_apply_template" => self.tool_apply_template(args).await,
182            "plugin_list_templates" => self.tool_list_templates(args).await,
183            "plugin_delete_template" => self.tool_delete_template(args).await,
184            "plugin_get_template" => self.tool_get_template(args).await,
185            "plugin_extract_batch" => self.tool_extract_batch(args).await,
186            "plugin_inspect_selector" => self.tool_inspect_selector(args).await,
187            _ => Err(PluginError::TemplateValidationError(format!(
188                "unknown tool: {name}"
189            ))),
190        };
191
192        match result {
193            Ok(data) => {
194                json!({ "content": [{ "type": "text", "text": serde_json::to_string(&data).unwrap_or_default() }] })
195            }
196            Err(e) => {
197                json!({ "content": [{ "type": "text", "text": format!("Error: {}", e) }], "isError": true })
198            }
199        }
200    }
201
202    // ── Tool implementations ───────────────────────────────────────────────
203
204    async fn tool_create_template(&self, args: &Value) -> Result<Value> {
205        let name = args
206            .get("name")
207            .and_then(Value::as_str)
208            .ok_or_else(|| PluginError::TemplateValidationError("missing 'name'".to_string()))?;
209
210        let description = args
211            .get("description")
212            .and_then(Value::as_str)
213            .map(ToString::to_string);
214
215        let tags = args
216            .get("tags")
217            .and_then(Value::as_array)
218            .map(|a| {
219                a.iter()
220                    .filter_map(|v| v.as_str().map(ToString::to_string))
221                    .collect()
222            })
223            .unwrap_or_default();
224
225        let mut template = ExtractionTemplate::new(name);
226        if let Some(desc) = description {
227            template = template.with_description(desc);
228        }
229        template = template.with_tags(tags);
230
231        self.template_store.save(&template).await?;
232
233        Ok(json!({
234            "template_id": template.id.to_string(),
235            "name": template.name,
236            "created_at": template.metadata.created_at.to_rfc3339(),
237        }))
238    }
239
240    async fn tool_add_region(&self, args: &Value) -> Result<Value> {
241        let template_id = args
242            .get("template_id")
243            .and_then(Value::as_str)
244            .and_then(|s| Uuid::parse_str(s).ok())
245            .ok_or_else(|| {
246                PluginError::TemplateValidationError("invalid 'template_id'".to_string())
247            })?;
248
249        let region_name = args
250            .get("region_name")
251            .and_then(Value::as_str)
252            .map(ToString::to_string)
253            .ok_or_else(|| {
254                PluginError::TemplateValidationError("missing 'region_name'".to_string())
255            })?;
256
257        let selector_css = args
258            .get("selector_css")
259            .and_then(Value::as_str)
260            .map(ToString::to_string);
261        let selector_xpath = args
262            .get("selector_xpath")
263            .and_then(Value::as_str)
264            .map(ToString::to_string);
265
266        let selector = match (selector_css, selector_xpath) {
267            (Some(css), Some(xpath)) => Selector::dual(css, xpath),
268            (Some(css), None) => Selector::css(css),
269            (None, Some(xpath)) => Selector::xpath(xpath),
270            (None, None) => {
271                return Err(PluginError::TemplateValidationError(
272                    "must provide either selector_css or selector_xpath".to_string(),
273                ));
274            }
275        };
276
277        // Load template
278        let mut template = self.template_store.get(&template_id).await?;
279
280        // Parse transformations - validate all entries and fail on first error
281        let mut transformations = Vec::new();
282        if let Some(arr) = args.get("transformations").and_then(Value::as_array) {
283            for (idx, v) in arr.iter().enumerate() {
284                let s = v.as_str().ok_or_else(|| {
285                    PluginError::TemplateValidationError(format!(
286                        "transformation at index {idx} must be a string"
287                    ))
288                })?;
289                let transformation = parse_transformation(s).map_err(|_| {
290                    PluginError::TemplateValidationError(format!(
291                        "invalid transformation at index {idx}: '{s}'. Supported transformations: {SUPPORTED_TRANSFORMATIONS}"
292                    ))
293                })?;
294                transformations.push(transformation);
295            }
296        }
297
298        // Create region
299        let mut region = Region::new(&region_name, selector, json!({"type": "string"}));
300        for t in transformations {
301            region = region.with_transformation(t);
302        }
303
304        template = template.with_region(region);
305        self.template_store.save(&template).await?;
306
307        Ok(json!({
308            "template_id": template.id.to_string(),
309            "region_name": region_name,
310            "regions_count": template.regions.len(),
311        }))
312    }
313
314    async fn tool_apply_template(&self, args: &Value) -> Result<Value> {
315        let template_id = args
316            .get("template_id")
317            .and_then(Value::as_str)
318            .and_then(|s| Uuid::parse_str(s).ok())
319            .ok_or_else(|| {
320                PluginError::TemplateValidationError("invalid 'template_id'".to_string())
321            })?;
322
323        let html = args
324            .get("html")
325            .and_then(Value::as_str)
326            .map(ToString::to_string)
327            .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
328
329        let url = args
330            .get("url")
331            .and_then(Value::as_str)
332            .map(ToString::to_string)
333            .ok_or_else(|| PluginError::TemplateValidationError("missing 'url'".to_string()))?;
334
335        let template = self.template_store.get(&template_id).await?;
336        let request = ExtractionRequest::new(template, &url, &html);
337        let result = self.extraction_engine.execute(&request).await?;
338
339        Ok(json!({
340            "data": result.data,
341            "metadata": {
342                "regions_successful": result.metadata.region_status.values().filter(|s| s.success).count(),
343                "total_regions": result.metadata.region_status.len(),
344                "elapsed_ms": result.metadata.elapsed_ms,
345            }
346        }))
347    }
348
349    async fn tool_list_templates(&self, _args: &Value) -> Result<Value> {
350        let templates = self.template_store.list().await?;
351        let list: Vec<_> = templates
352            .iter()
353            .map(|t| {
354                json!({
355                    "id": t.id.to_string(),
356                    "name": &t.name,
357                    "description": &t.description,
358                    "regions": t.regions.len(),
359                    "created_at": t.metadata.created_at.to_rfc3339(),
360                    "usage_count": t.metadata.usage_count,
361                    "tags": &t.metadata.tags,
362                })
363            })
364            .collect();
365
366        Ok(json!({
367            "count": list.len(),
368            "templates": list,
369        }))
370    }
371
372    async fn tool_delete_template(&self, args: &Value) -> Result<Value> {
373        let template_id = args
374            .get("template_id")
375            .and_then(Value::as_str)
376            .and_then(|s| Uuid::parse_str(s).ok())
377            .ok_or_else(|| {
378                PluginError::TemplateValidationError("invalid 'template_id'".to_string())
379            })?;
380
381        self.template_store.delete(&template_id).await?;
382
383        Ok(json!({
384            "deleted": template_id.to_string(),
385        }))
386    }
387
388    async fn tool_get_template(&self, args: &Value) -> Result<Value> {
389        let template_id = args
390            .get("template_id")
391            .and_then(Value::as_str)
392            .and_then(|s| Uuid::parse_str(s).ok())
393            .ok_or_else(|| {
394                PluginError::TemplateValidationError("invalid 'template_id'".to_string())
395            })?;
396
397        let template = self.template_store.get(&template_id).await?;
398
399        Ok(json!({
400            "id": template.id.to_string(),
401            "name": template.name,
402            "description": template.description,
403            "regions": template.regions.iter().map(|r| {
404                json!({
405                    "name": r.name,
406                    "selector": format!("{:?}", r.selector),
407                    "transformations": r.transformations.iter().map(|t| format!("{t:?}")).collect::<Vec<_>>(),
408                })
409            }).collect::<Vec<_>>(),
410            "metadata": {
411                "created_at": template.metadata.created_at.to_rfc3339(),
412                "updated_at": template.metadata.updated_at.to_rfc3339(),
413                "usage_count": template.metadata.usage_count,
414            }
415        }))
416    }
417
418    async fn tool_extract_batch(&self, args: &Value) -> Result<Value> {
419        let template_id = args
420            .get("template_id")
421            .and_then(Value::as_str)
422            .and_then(|s| Uuid::parse_str(s).ok())
423            .ok_or_else(|| {
424                PluginError::TemplateValidationError("invalid 'template_id'".to_string())
425            })?;
426
427        let html = args
428            .get("html")
429            .and_then(Value::as_str)
430            .map(ToString::to_string)
431            .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
432
433        let url = args
434            .get("url")
435            .and_then(Value::as_str)
436            .map(ToString::to_string)
437            .ok_or_else(|| PluginError::TemplateValidationError("missing 'url'".to_string()))?;
438
439        let root_selector_str = args
440            .get("root_selector")
441            .and_then(Value::as_str)
442            .map(ToString::to_string)
443            .ok_or_else(|| {
444                PluginError::TemplateValidationError("missing 'root_selector'".to_string())
445            })?;
446
447        // Parse root selector as CSS (XPath not supported for batch extraction)
448        let root_selector =
449            ScraperSelector::parse(&root_selector_str).map_err(|_| PluginError::SelectorError {
450                selector: root_selector_str.clone(),
451                reason: "Failed to parse root_selector as CSS selector".to_string(),
452            })?;
453
454        // Parse HTML and find all root containers.
455        // Keep this in a separate scope so non-Send scraper internals are dropped before await.
456        let root_elements: Vec<String> = {
457            let document = Html::parse_document(&html);
458            document
459                .select(&root_selector)
460                .map(|elem| elem.inner_html())
461                .collect()
462        };
463
464        if root_elements.is_empty() {
465            return Err(PluginError::ExtractionError(format!(
466                "root_selector matched no elements: {root_selector_str}"
467            )));
468        }
469
470        // Extract data from each root container
471        let template = self.template_store.get(&template_id).await?;
472        let mut results = Vec::new();
473
474        for root_html in root_elements {
475            let request = ExtractionRequest::new(template.clone(), &url, &root_html);
476            match self.extraction_engine.execute(&request).await {
477                Ok(result) => {
478                    results.push(json!({
479                        "data": result.data,
480                        "successful_regions": result.metadata.region_status.values().filter(|s| s.success).count(),
481                    }));
482                }
483                Err(e) => {
484                    // Continue with partial results on error
485                    results.push(json!({
486                        "error": e.to_string(),
487                        "successful_regions": 0,
488                    }));
489                }
490            }
491        }
492
493        Ok(json!({
494            "root_selector": root_selector_str,
495            "results": results,
496            "total_matched": results.len(),
497            "successful": results.iter().filter(|r| r.get("data").is_some()).count(),
498        }))
499    }
500
501    async fn tool_inspect_selector(&self, args: &Value) -> Result<Value> {
502        let html = args
503            .get("html")
504            .and_then(Value::as_str)
505            .map(ToString::to_string)
506            .ok_or_else(|| PluginError::TemplateValidationError("missing 'html'".to_string()))?;
507
508        let selector_css = args
509            .get("selector_css")
510            .and_then(Value::as_str)
511            .map(ToString::to_string);
512        let selector_xpath = args
513            .get("selector_xpath")
514            .and_then(Value::as_str)
515            .map(ToString::to_string);
516
517        let selector = match (&selector_css, &selector_xpath) {
518            (Some(css), Some(xpath)) => Selector::dual(css, xpath),
519            (Some(css), None) => Selector::css(css),
520            (None, Some(xpath)) => Selector::xpath(xpath),
521            (None, None) => {
522                return Err(PluginError::TemplateValidationError(
523                    "must provide either selector_css or selector_xpath".to_string(),
524                ));
525            }
526        };
527
528        selector.validate()?;
529
530        // Use the CSS selector for validation/counting since XPath is not yet supported
531        if let Some(css) = selector_css {
532            let (is_valid, count) = self
533                .extraction_engine
534                .validate_selector(&html, &css)
535                .await?;
536            Ok(json!({
537                "selector": css,
538                "selector_type": "css",
539                "valid": is_valid,
540                "match_count": count,
541                "preview": if count > 0 { "Selector matched elements" } else { "No elements matched" }
542            }))
543        } else if selector_xpath.is_some() {
544            // XPath validation not yet implemented
545            Ok(json!({
546                "selector": selector_xpath,
547                "selector_type": "xpath",
548                "valid": true,
549                "note": "XPath selectors are not yet supported for validation. Please use CSS selectors to test matches."
550            }))
551        } else {
552            Err(PluginError::TemplateValidationError(
553                "No selector provided".to_string(),
554            ))
555        }
556    }
557}
558
559// ─── Helpers ───────────────────────────────────────────────────────────────
560
561pub(crate) fn parse_transformation(s: &str) -> Result<Transformation> {
562    match s {
563        "Trim" => Ok(Transformation::Trim),
564        "Lowercase" => Ok(Transformation::Lowercase),
565        "Uppercase" => Ok(Transformation::Uppercase),
566        "RemoveWhitespace" => Ok(Transformation::RemoveWhitespace),
567        "NormalizeWhitespace" => Ok(Transformation::NormalizeWhitespace),
568        "StripHtml" => Ok(Transformation::StripHtml),
569        "DecodeHtml" => Ok(Transformation::DecodeHtml),
570        "ParseJson" => Ok(Transformation::ParseJson),
571        s if s.starts_with("RegexExtract:") => s
572            .strip_prefix("RegexExtract:")
573            .and_then(|rest| rest.rsplit_once('/'))
574            .map_or_else(
575                || {
576                    Err(PluginError::TemplateValidationError(
577                        "RegexExtract format: RegexExtract:pattern/group".to_string(),
578                    ))
579                },
580                |(pattern, group_str)| {
581                    let group = group_str.parse::<usize>().map_err(|_| {
582                        PluginError::TemplateValidationError(
583                            "RegexExtract group must be a positive integer".to_string(),
584                        )
585                    })?;
586                    Ok(Transformation::RegexExtract {
587                        pattern: pattern.to_string(),
588                        group,
589                    })
590                },
591            ),
592        s if s.starts_with("Coerce:") => s.strip_prefix("Coerce:").map_or_else(
593            || {
594                Err(PluginError::TemplateValidationError(
595                    "Coerce format: Coerce:type".to_string(),
596                ))
597            },
598            |target_type| {
599                Ok(Transformation::Coerce {
600                    target_type: target_type.to_string(),
601                })
602            },
603        ),
604        s if s.starts_with("Filter:") => s.strip_prefix("Filter:").map_or_else(
605            || {
606                Err(PluginError::TemplateValidationError(
607                    "Filter format: Filter:pattern".to_string(),
608                ))
609            },
610            |pattern| {
611                Ok(Transformation::Filter {
612                    pattern: pattern.to_string(),
613                })
614            },
615        ),
616        s if s.starts_with("Regex:") => s
617            .strip_prefix("Regex:")
618            .and_then(|rest| rest.split_once('/'))
619            .map_or_else(
620                || {
621                    Err(PluginError::TemplateValidationError(
622                        "Regex format: Regex:pattern/replacement".to_string(),
623                    ))
624                },
625                |(pattern, replacement)| {
626                    Ok(Transformation::Regex {
627                        pattern: pattern.to_string(),
628                        replacement: replacement.to_string(),
629                    })
630                },
631            ),
632        _ => Err(PluginError::TemplateValidationError(format!(
633            "unknown transformation: {s}"
634        ))),
635    }
636}
637
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: every documented spec form parses, unknown names do not.
    #[test]
    fn test_parse_transformation() {
        assert!(parse_transformation("Trim").is_ok());
        assert!(parse_transformation("Lowercase").is_ok());
        assert!(parse_transformation("Regex:pattern/replace").is_ok());
        assert!(parse_transformation("RegexExtract:price:(\\d+\\.\\d+)/1").is_ok());
        assert!(parse_transformation("Coerce:number").is_ok());
        assert!(parse_transformation("Filter:^ok$").is_ok());
        assert!(parse_transformation("Invalid").is_err());
    }

    /// Bare names are case-sensitive and the empty string is not a spec.
    #[test]
    fn test_parse_transformation_rejects_unknown_names() {
        assert!(parse_transformation("trim").is_err());
        assert!(parse_transformation("").is_err());
    }

    /// `Regex:` splits at the first slash, leaving later slashes in the replacement.
    #[test]
    fn test_parse_regex_splits_on_first_slash() {
        assert!(matches!(
            parse_transformation("Regex:a/b/c"),
            Ok(Transformation::Regex { pattern, replacement })
                if pattern == "a" && replacement == "b/c"
        ));
        assert!(parse_transformation("Regex:no-separator").is_err());
    }

    /// `RegexExtract:` splits at the last slash, leaving earlier slashes in the pattern.
    #[test]
    fn test_parse_regex_extract_splits_on_last_slash() {
        assert!(matches!(
            parse_transformation("RegexExtract:a/b/2"),
            Ok(Transformation::RegexExtract { pattern, group: 2 }) if pattern == "a/b"
        ));
        assert!(parse_transformation("RegexExtract:no-separator").is_err());
        assert!(parse_transformation("RegexExtract:pattern/not-a-number").is_err());
    }

    /// `Coerce:` and `Filter:` keep everything after the colon verbatim.
    #[test]
    fn test_parse_coerce_and_filter_payloads() {
        assert!(matches!(
            parse_transformation("Coerce:number"),
            Ok(Transformation::Coerce { target_type }) if target_type == "number"
        ));
        assert!(matches!(
            parse_transformation("Filter:^ok$"),
            Ok(Transformation::Filter { pattern }) if pattern == "^ok$"
        ));
    }
}
652}