Skip to main content

orbok_extract/
plugin.rs

//! Plugin extractor interface (RFC-028 §7).
//!
//! This module defines the security-boundary types for external
//! extractor plugins. In v0.8, plugin *loading* is not yet implemented
//! (dynamic linking is deferred), but the interface is defined so that:
//!
//! 1. Built-in extractors can be registered with the same manifest.
//! 2. The security contract is formalized before any loading code exists.
//!
//! ## Security model (RFC-028 §6)
//!
//! - A plugin extractor receives only a `ValidatedPath` — it cannot
//!   request arbitrary filesystem access. The PathGuard boundary
//!   (RFC-003 §8) applies before any plugin receives a path.
//! - Plugin failures are isolated: a panic in a plugin extractor must
//!   not crash the orbok process (RFC-005 §13).
//! - User consent is required before a non-built-in plugin is used;
//!   the manifest provides the metadata for that consent dialog.
//! - Plugin logging must follow NFR-014: no document contents logged.
//!
//! ## Dynamic loading (future)
//!
//! When RFC-028 is fully activated, plugin `.so`/`.dll` files will be
//! located via the `PluginRegistry`. Until then, `PluginRegistry` only
//! holds the built-in extractors.
26
27use crate::types::DocumentExtractor;
28
/// Metadata attached to every extractor plugin for display and consent.
///
/// Every field is either a `&'static str`, a static slice of them, or a
/// `bool`, so cloning and comparing manifests never allocates.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PluginManifest {
    /// Stable identifier (e.g. `"excel-xlsx-v1"`). Must be unique.
    pub plugin_id: &'static str,
    /// Human-readable display name.
    pub display_name: &'static str,
    /// Handled file extensions, one slice element per extension
    /// (e.g. `&["html", "htm"]`). The built-in manifests list them in
    /// lowercase and without a leading dot.
    pub extensions: &'static [&'static str],
    /// Author name.
    pub author: &'static str,
    /// License (user sees this in the consent dialog).
    pub license: &'static str,
    /// Whether this plugin is built-in (no user consent required) or
    /// external (user must explicitly allow).
    pub builtin: bool,
    /// Privacy statement: what the plugin does NOT do.
    pub privacy_note: &'static str,
}
48
49/// A plugin extractor: manifest metadata + the extraction implementation.
50pub struct PluginExtractor {
51    pub manifest: PluginManifest,
52    pub extractor: Box<dyn DocumentExtractor>,
53}
54
55impl PluginExtractor {
56    /// Wrap a built-in extractor with its manifest.
57    pub fn builtin(manifest: PluginManifest, extractor: Box<dyn DocumentExtractor>) -> Self {
58        debug_assert!(
59            manifest.builtin,
60            "use PluginExtractor::external for non-built-in plugins"
61        );
62        Self {
63            manifest,
64            extractor,
65        }
66    }
67}
68
69/// The plugin registry (RFC-028 §8).
70///
71/// In v0.8, only built-in plugins are registered. Dynamic loading is
72/// gated behind `RFC-028` being fully activated.
73pub struct PluginRegistry {
74    plugins: Vec<PluginExtractor>,
75}
76
77impl Default for PluginRegistry {
78    fn default() -> Self {
79        use crate::docx::DocxExtractor;
80        use crate::html::HtmlExtractor;
81        use crate::markdown::MarkdownExtractor;
82        use crate::pdf::PdfExtractor;
83        use crate::text::PlainTextExtractor;
84        let mut reg = Self {
85            plugins: Vec::new(),
86        };
87        reg.register_builtin(
88            PluginManifest {
89                plugin_id: "docx-v1",
90                display_name: "Microsoft Word (DOCX)",
91                extensions: &["docx"],
92                author: "orbok built-in",
93                license: "Apache-2.0",
94                builtin: true,
95                privacy_note: "Does not transmit content externally.",
96            },
97            Box::new(DocxExtractor),
98        );
99        reg.register_builtin(
100            PluginManifest {
101                plugin_id: "html-v1",
102                display_name: "HTML",
103                extensions: &["html", "htm"],
104                author: "orbok built-in",
105                license: "Apache-2.0",
106                builtin: true,
107                privacy_note: "Does not transmit content externally.",
108            },
109            Box::new(HtmlExtractor),
110        );
111        reg.register_builtin(
112            PluginManifest {
113                plugin_id: "markdown-v1",
114                display_name: "Markdown",
115                extensions: &["md", "markdown"],
116                author: "orbok built-in",
117                license: "Apache-2.0",
118                builtin: true,
119                privacy_note: "Does not transmit content externally.",
120            },
121            Box::new(MarkdownExtractor),
122        );
123        reg.register_builtin(
124            PluginManifest {
125                plugin_id: "plain-text-v1",
126                display_name: "Plain Text",
127                extensions: &[
128                    "txt", "log", "rs", "py", "js", "ts", "go", "sql", "toml", "yaml", "yml",
129                    "json", "xml", "css", "html", "htm",
130                ],
131                author: "orbok built-in",
132                license: "Apache-2.0",
133                builtin: true,
134                privacy_note: "Does not transmit content externally.",
135            },
136            Box::new(PlainTextExtractor),
137        );
138        reg.register_builtin(
139            PluginManifest {
140                plugin_id: "pdf-lopdf-v1",
141                display_name: "PDF (lopdf)",
142                extensions: &["pdf"],
143                author: "orbok built-in",
144                license: "Apache-2.0",
145                builtin: true,
146                privacy_note: "Extracts text locally. Does not transmit content externally.",
147            },
148            Box::new(PdfExtractor),
149        );
150        reg
151    }
152}
153
154impl PluginRegistry {
155    fn register_builtin(
156        &mut self,
157        manifest: PluginManifest,
158        extractor: Box<dyn DocumentExtractor>,
159    ) {
160        self.plugins
161            .push(PluginExtractor::builtin(manifest, extractor));
162    }
163
164    /// Find the plugin that handles the given extension.
165    pub fn find_for_extension(&self, ext: &str) -> Option<&PluginExtractor> {
166        let ext_lower = ext.to_ascii_lowercase();
167        self.plugins
168            .iter()
169            .find(|p| p.manifest.extensions.contains(&ext_lower.as_str()))
170    }
171
172    /// All registered plugin manifests (for the Models/Settings view).
173    pub fn manifests(&self) -> Vec<&PluginManifest> {
174        self.plugins.iter().map(|p| &p.manifest).collect()
175    }
176
177    /// Number of registered plugins.
178    pub fn len(&self) -> usize {
179        self.plugins.len()
180    }
181}