use crate::types::DocumentExtractor;
/// Static descriptive metadata for a document-extraction plugin.
///
/// Every field is either `'static` borrowed data or a `bool`, so a manifest
/// can be written as a literal and cloning it copies no heap data.
#[derive(Debug, Clone)]
pub struct PluginManifest {
    /// Identifier of the plugin, e.g. `"docx-v1"`.
    pub plugin_id: &'static str,
    /// Human-readable plugin name, e.g. `"Microsoft Word (DOCX)"`.
    pub display_name: &'static str,
    /// File extensions the plugin declares it handles. The built-in
    /// manifests list them in lowercase and without a leading dot.
    pub extensions: &'static [&'static str],
    /// Who provides the plugin.
    pub author: &'static str,
    /// License string for the plugin, e.g. `"Apache-2.0"`.
    pub license: &'static str,
    /// `true` when the plugin is one of the built-in extractors.
    pub builtin: bool,
    /// Free-text statement describing how the plugin treats document content.
    pub privacy_note: &'static str,
}
/// A plugin entry: its descriptive manifest paired with the extractor
/// implementation it describes.
pub struct PluginExtractor {
    /// Metadata describing the plugin.
    pub manifest: PluginManifest,
    /// The extractor implementation, stored as a boxed trait object.
    pub extractor: Box<dyn DocumentExtractor>,
}
impl PluginExtractor {
pub fn builtin(manifest: PluginManifest, extractor: Box<dyn DocumentExtractor>) -> Self {
debug_assert!(
manifest.builtin,
"use PluginExtractor::external for non-built-in plugins"
);
Self {
manifest,
extractor,
}
}
}
/// Collection of registered plugins, kept in registration order.
pub struct PluginRegistry {
    /// Registered plugins; new entries are appended at the end.
    plugins: Vec<PluginExtractor>,
}
impl Default for PluginRegistry {
fn default() -> Self {
use crate::docx::DocxExtractor;
use crate::html::HtmlExtractor;
use crate::markdown::MarkdownExtractor;
use crate::pdf::PdfExtractor;
use crate::text::PlainTextExtractor;
let mut reg = Self {
plugins: Vec::new(),
};
reg.register_builtin(
PluginManifest {
plugin_id: "docx-v1",
display_name: "Microsoft Word (DOCX)",
extensions: &["docx"],
author: "orbok built-in",
license: "Apache-2.0",
builtin: true,
privacy_note: "Does not transmit content externally.",
},
Box::new(DocxExtractor),
);
reg.register_builtin(
PluginManifest {
plugin_id: "html-v1",
display_name: "HTML",
extensions: &["html", "htm"],
author: "orbok built-in",
license: "Apache-2.0",
builtin: true,
privacy_note: "Does not transmit content externally.",
},
Box::new(HtmlExtractor),
);
reg.register_builtin(
PluginManifest {
plugin_id: "markdown-v1",
display_name: "Markdown",
extensions: &["md", "markdown"],
author: "orbok built-in",
license: "Apache-2.0",
builtin: true,
privacy_note: "Does not transmit content externally.",
},
Box::new(MarkdownExtractor),
);
reg.register_builtin(
PluginManifest {
plugin_id: "plain-text-v1",
display_name: "Plain Text",
extensions: &[
"txt", "log", "rs", "py", "js", "ts", "go", "sql", "toml", "yaml", "yml",
"json", "xml", "css", "html", "htm",
],
author: "orbok built-in",
license: "Apache-2.0",
builtin: true,
privacy_note: "Does not transmit content externally.",
},
Box::new(PlainTextExtractor),
);
reg.register_builtin(
PluginManifest {
plugin_id: "pdf-lopdf-v1",
display_name: "PDF (lopdf)",
extensions: &["pdf"],
author: "orbok built-in",
license: "Apache-2.0",
builtin: true,
privacy_note: "Extracts text locally. Does not transmit content externally.",
},
Box::new(PdfExtractor),
);
reg
}
}
impl PluginRegistry {
    /// Appends a built-in plugin to the end of the registry.
    ///
    /// The entry is built with `PluginExtractor::builtin`, so `manifest` is
    /// expected to have `builtin == true`.
    fn register_builtin(
        &mut self,
        manifest: PluginManifest,
        extractor: Box<dyn DocumentExtractor>,
    ) {
        self.plugins
            .push(PluginExtractor::builtin(manifest, extractor));
    }

    /// Returns the first registered plugin whose manifest lists `ext`.
    ///
    /// The comparison ignores ASCII case on both sides and does not allocate.
    /// `ext` is compared as given (no leading dot is stripped). When several
    /// plugins declare the same extension, the one registered earliest wins.
    /// Returns `None` when no plugin declares the extension.
    pub fn find_for_extension(&self, ext: &str) -> Option<&PluginExtractor> {
        self.plugins.iter().find(|plugin| {
            plugin
                .manifest
                .extensions
                .iter()
                .any(|known| known.eq_ignore_ascii_case(ext))
        })
    }

    /// Returns references to the manifests of all plugins, in registration
    /// order.
    pub fn manifests(&self) -> Vec<&PluginManifest> {
        self.plugins.iter().map(|plugin| &plugin.manifest).collect()
    }

    /// Returns the number of registered plugins.
    pub fn len(&self) -> usize {
        self.plugins.len()
    }

    /// Returns `true` when no plugins are registered.
    pub fn is_empty(&self) -> bool {
        self.plugins.is_empty()
    }
}