Skip to main content

orbok_extract/
plugin.rs

1//! Plugin extractor interface (RFC-028 §7).
2//!
3//! This module defines the security-boundary types for external
4//! extractor plugins. In v0.8, plugin *loading* is not yet implemented
5//! (dynamic linking is deferred), but the interface is defined so that:
6//!
7//! 1. Built-in extractors can be registered with the same manifest.
8//! 2. The security contract is formalized before any loading code exists.
9//!
10//! ## Security model (RFC-028 §6)
11//!
12//! - A plugin extractor receives only a `ValidatedPath` — it cannot
13//!   request arbitrary filesystem access. The PathGuard boundary
14//!   (RFC-003 §8) applies before any plugin receives a path.
15//! - Plugin failures are isolated: a panic in a plugin extractor must
16//!   not crash the orbok process (RFC-005 §13).
17//! - User consent is required before a non-built-in plugin is used;
18//!   the manifest provides the metadata for that consent dialog.
19//! - Plugin logging must follow NFR-014: no document contents logged.
20//!
21//! ## Dynamic loading (future)
22//!
23//! When RFC-028 is fully activated, plugin `.so`/`.dll` files will be
24//! located via the `PluginRegistry`. Until then, `PluginRegistry` only
25//! holds the built-in extractors.
26
27use crate::types::DocumentExtractor;
28
/// Metadata attached to every extractor plugin for display and consent.
///
/// Every field is either a `'static` reference or a `bool`, so cloning a
/// manifest never allocates and two manifests can be compared by value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PluginManifest {
    /// Stable identifier (e.g. `"excel-xlsx-v1"`). Must be unique.
    pub plugin_id: &'static str,
    /// Human-readable display name.
    pub display_name: &'static str,
    /// File extensions handled by this plugin, one slice entry per
    /// extension (e.g. `&["html", "htm"]`). The built-in manifests use
    /// lowercase entries without a leading dot.
    pub extensions: &'static [&'static str],
    /// Author name.
    pub author: &'static str,
    /// License (user sees this in the consent dialog).
    pub license: &'static str,
    /// Whether this plugin is built-in (no user consent required) or
    /// external (user must explicitly allow).
    pub builtin: bool,
    /// Privacy statement: what the plugin does NOT do.
    pub privacy_note: &'static str,
}
48
49/// A plugin extractor: manifest metadata + the extraction implementation.
50pub struct PluginExtractor {
51    pub manifest: PluginManifest,
52    pub extractor: Box<dyn DocumentExtractor>,
53}
54
55impl PluginExtractor {
56    /// Wrap a built-in extractor with its manifest.
57    pub fn builtin(manifest: PluginManifest, extractor: Box<dyn DocumentExtractor>) -> Self {
58        debug_assert!(manifest.builtin, "use PluginExtractor::external for non-built-in plugins");
59        Self { manifest, extractor }
60    }
61}
62
/// The plugin registry (RFC-028 §8).
///
/// In v0.8, only built-in plugins are registered. Dynamic loading is
/// gated behind `RFC-028` being fully activated.
pub struct PluginRegistry {
    /// Registered plugins in registration order. Extension lookup scans
    /// this list front to back and returns the first match.
    plugins: Vec<PluginExtractor>,
}
70
71impl Default for PluginRegistry {
72    fn default() -> Self {
73        use crate::docx::DocxExtractor;
74        use crate::html::HtmlExtractor;
75        use crate::markdown::MarkdownExtractor;
76        use crate::pdf::PdfExtractor;
77        use crate::text::PlainTextExtractor;
78        let mut reg = Self { plugins: Vec::new() };
79        reg.register_builtin(
80            PluginManifest {
81                plugin_id: "docx-v1",
82                display_name: "Microsoft Word (DOCX)",
83                extensions: &["docx"],
84                author: "orbok built-in",
85                license: "Apache-2.0",
86                builtin: true,
87                privacy_note: "Does not transmit content externally.",
88            },
89            Box::new(DocxExtractor),
90        );
91        reg.register_builtin(
92            PluginManifest {
93                plugin_id: "html-v1",
94                display_name: "HTML",
95                extensions: &["html", "htm"],
96                author: "orbok built-in",
97                license: "Apache-2.0",
98                builtin: true,
99                privacy_note: "Does not transmit content externally.",
100            },
101            Box::new(HtmlExtractor),
102        );
103        reg.register_builtin(
104            PluginManifest {
105                plugin_id: "markdown-v1",
106                display_name: "Markdown",
107                extensions: &["md", "markdown"],
108                author: "orbok built-in",
109                license: "Apache-2.0",
110                builtin: true,
111                privacy_note: "Does not transmit content externally.",
112            },
113            Box::new(MarkdownExtractor),
114        );
115        reg.register_builtin(
116            PluginManifest {
117                plugin_id: "plain-text-v1",
118                display_name: "Plain Text",
119                extensions: &["txt", "log", "rs", "py", "js", "ts", "go", "sql", "toml",
120                              "yaml", "yml", "json", "xml", "css", "html", "htm"],
121                author: "orbok built-in",
122                license: "Apache-2.0",
123                builtin: true,
124                privacy_note: "Does not transmit content externally.",
125            },
126            Box::new(PlainTextExtractor),
127        );
128        reg.register_builtin(
129            PluginManifest {
130                plugin_id: "pdf-lopdf-v1",
131                display_name: "PDF (lopdf)",
132                extensions: &["pdf"],
133                author: "orbok built-in",
134                license: "Apache-2.0",
135                builtin: true,
136                privacy_note: "Extracts text locally. Does not transmit content externally.",
137            },
138            Box::new(PdfExtractor),
139        );
140        reg
141    }
142}
143
144impl PluginRegistry {
145    fn register_builtin(&mut self, manifest: PluginManifest, extractor: Box<dyn DocumentExtractor>) {
146        self.plugins.push(PluginExtractor::builtin(manifest, extractor));
147    }
148
149    /// Find the plugin that handles the given extension.
150    pub fn find_for_extension(&self, ext: &str) -> Option<&PluginExtractor> {
151        let ext_lower = ext.to_ascii_lowercase();
152        self.plugins
153            .iter()
154            .find(|p| p.manifest.extensions.contains(&ext_lower.as_str()))
155    }
156
157    /// All registered plugin manifests (for the Models/Settings view).
158    pub fn manifests(&self) -> Vec<&PluginManifest> {
159        self.plugins.iter().map(|p| &p.manifest).collect()
160    }
161
162    /// Number of registered plugins.
163    pub fn len(&self) -> usize {
164        self.plugins.len()
165    }
166}