use crate::Result;
use crate::core::config::ExtractionConfig;
use crate::plugins::registry::get_document_extractor_registry;
use crate::types::ExtractionResult;
use once_cell::sync::Lazy;
use std::sync::Arc;
pub trait SyncExtractor {
fn extract_sync(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult>;
}
pub mod csv;
pub mod structured;
pub mod text;
pub mod djot_format;
pub mod frontmatter_utils;
pub(crate) mod markdown_utils;
#[cfg(feature = "archives")]
pub mod security;
#[cfg(any(feature = "ocr", feature = "ocr-wasm"))]
pub mod image;
#[cfg(feature = "archives")]
pub mod archive;
#[cfg(feature = "email")]
pub mod email;
#[cfg(feature = "email")]
pub mod pst;
#[cfg(any(feature = "excel", feature = "excel-wasm"))]
pub mod excel;
#[cfg(feature = "hwp")]
pub mod hwp;
#[cfg(feature = "iwork")]
pub mod iwork;
#[cfg(feature = "html")]
pub mod html;
#[cfg(feature = "office")]
pub mod bibtex;
#[cfg(feature = "office")]
pub mod citation;
#[cfg(feature = "office")]
pub mod doc;
#[cfg(feature = "office")]
pub mod dbf;
#[cfg(feature = "office")]
pub mod docx;
#[cfg(feature = "office")]
pub mod epub;
#[cfg(feature = "office")]
pub mod fictionbook;
pub mod markdown;
#[cfg(feature = "mdx")]
pub mod mdx;
#[cfg(feature = "office")]
pub mod rst;
#[cfg(feature = "office")]
pub mod latex;
#[cfg(feature = "office")]
pub mod jupyter;
#[cfg(feature = "office")]
pub mod orgmode;
#[cfg(feature = "office")]
pub mod odt;
#[cfg(feature = "office")]
pub mod opml;
#[cfg(feature = "office")]
pub mod typst;
#[cfg(feature = "xml")]
pub mod jats;
#[cfg(feature = "pdf")]
pub mod pdf;
#[cfg(feature = "office")]
pub mod ppt;
#[cfg(feature = "office")]
pub mod pptx;
#[cfg(feature = "office")]
pub mod rtf;
#[cfg(feature = "xml")]
pub mod xml;
#[cfg(feature = "xml")]
pub mod docbook;
pub use csv::CsvExtractor;
pub use markdown::MarkdownExtractor;
pub use structured::StructuredExtractor;
pub use text::PlainTextExtractor;
#[cfg(any(feature = "ocr", feature = "ocr-wasm"))]
pub use image::ImageExtractor;
#[cfg(feature = "archives")]
pub use archive::{GzipExtractor, SevenZExtractor, TarExtractor, ZipExtractor};
#[cfg(feature = "email")]
pub use email::EmailExtractor;
#[cfg(feature = "email")]
pub use pst::PstExtractor;
#[cfg(any(feature = "excel", feature = "excel-wasm"))]
pub use excel::ExcelExtractor;
#[cfg(feature = "hwp")]
pub use hwp::HwpExtractor;
#[cfg(feature = "iwork")]
pub use iwork::{keynote::KeynoteExtractor, numbers::NumbersExtractor, pages::PagesExtractor};
#[cfg(feature = "html")]
pub use html::HtmlExtractor;
#[cfg(feature = "office")]
pub use bibtex::BibtexExtractor;
#[cfg(feature = "office")]
pub use citation::CitationExtractor;
#[cfg(feature = "office")]
pub use dbf::DbfExtractor;
#[cfg(feature = "office")]
pub use doc::DocExtractor;
#[cfg(feature = "office")]
pub use docx::DocxExtractor;
#[cfg(feature = "office")]
pub use epub::EpubExtractor;
#[cfg(feature = "office")]
pub use fictionbook::FictionBookExtractor;
pub use djot_format::DjotExtractor;
#[cfg(feature = "mdx")]
pub use mdx::MdxExtractor;
#[cfg(feature = "office")]
pub use rst::RstExtractor;
#[cfg(feature = "office")]
pub use latex::LatexExtractor;
#[cfg(feature = "office")]
pub use jupyter::JupyterExtractor;
#[cfg(feature = "office")]
pub use orgmode::OrgModeExtractor;
#[cfg(feature = "office")]
pub use odt::OdtExtractor;
#[cfg(feature = "xml")]
pub use jats::JatsExtractor;
#[cfg(feature = "office")]
pub use opml::OpmlExtractor;
#[cfg(feature = "office")]
pub use typst::TypstExtractor;
#[cfg(feature = "pdf")]
pub use pdf::PdfExtractor;
#[cfg(feature = "office")]
pub use ppt::PptExtractor;
#[cfg(feature = "office")]
pub use pptx::PptxExtractor;
#[cfg(feature = "office")]
pub use rtf::RtfExtractor;
#[cfg(feature = "xml")]
pub use xml::XmlExtractor;
#[cfg(feature = "xml")]
pub use docbook::DocbookExtractor;
static EXTRACTORS_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_default_extractors);
pub fn ensure_initialized() -> Result<()> {
EXTRACTORS_INITIALIZED
.as_ref()
.map(|_| ())
.map_err(|e| crate::KreuzbergError::Plugin {
message: format!("Failed to register default extractors: {}", e),
plugin_name: "built-in-extractors".to_string(),
})?;
let registry = get_document_extractor_registry();
let registry_guard = registry.read();
if registry_guard.list().is_empty() {
drop(registry_guard);
register_default_extractors()?;
}
Ok(())
}
pub fn register_default_extractors() -> Result<()> {
let registry = get_document_extractor_registry();
let mut registry = registry.write();
registry.register(Arc::new(PlainTextExtractor::new()))?;
registry.register(Arc::new(MarkdownExtractor::new()))?;
registry.register(Arc::new(StructuredExtractor::new()))?;
registry.register(Arc::new(CsvExtractor::new()))?;
#[cfg(any(feature = "ocr", feature = "ocr-wasm"))]
registry.register(Arc::new(ImageExtractor::new()))?;
#[cfg(feature = "xml")]
{
registry.register(Arc::new(XmlExtractor::new()))?;
registry.register(Arc::new(JatsExtractor::new()))?;
registry.register(Arc::new(DocbookExtractor::new()))?;
}
#[cfg(feature = "pdf")]
registry.register(Arc::new(PdfExtractor::new()))?;
#[cfg(any(feature = "excel", feature = "excel-wasm"))]
registry.register(Arc::new(ExcelExtractor::new()))?;
registry.register(Arc::new(DjotExtractor::new()))?;
#[cfg(feature = "office")]
{
registry.register(Arc::new(BibtexExtractor::new()))?;
registry.register(Arc::new(CitationExtractor::new()))?;
registry.register(Arc::new(EpubExtractor::new()))?;
registry.register(Arc::new(FictionBookExtractor::new()))?;
registry.register(Arc::new(RtfExtractor::new()))?;
registry.register(Arc::new(RstExtractor::new()))?;
registry.register(Arc::new(LatexExtractor::new()))?;
registry.register(Arc::new(JupyterExtractor::new()))?;
registry.register(Arc::new(OrgModeExtractor::new()))?;
registry.register(Arc::new(OpmlExtractor::new()))?;
registry.register(Arc::new(TypstExtractor::new()))?;
registry.register(Arc::new(DocExtractor::new()))?;
registry.register(Arc::new(DocxExtractor::new()))?;
registry.register(Arc::new(PptExtractor::new()))?;
registry.register(Arc::new(PptxExtractor::new()))?;
registry.register(Arc::new(OdtExtractor::new()))?;
registry.register(Arc::new(DbfExtractor::new()))?;
}
#[cfg(feature = "hwp")]
{
registry.register(Arc::new(HwpExtractor::new()))?;
}
#[cfg(feature = "iwork")]
{
registry.register(Arc::new(PagesExtractor::new()))?;
registry.register(Arc::new(NumbersExtractor::new()))?;
registry.register(Arc::new(KeynoteExtractor::new()))?;
}
#[cfg(feature = "mdx")]
registry.register(Arc::new(MdxExtractor::new()))?;
#[cfg(feature = "email")]
{
registry.register(Arc::new(EmailExtractor::new()))?;
registry.register(Arc::new(PstExtractor::new()))?;
}
#[cfg(feature = "html")]
registry.register(Arc::new(HtmlExtractor::new()))?;
#[cfg(feature = "archives")]
{
registry.register(Arc::new(ZipExtractor::new()))?;
registry.register(Arc::new(TarExtractor::new()))?;
registry.register(Arc::new(SevenZExtractor::new()))?;
registry.register(Arc::new(GzipExtractor::new()))?;
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_register_default_extractors() {
let registry = get_document_extractor_registry();
{
let mut reg = registry.write();
*reg = crate::plugins::registry::DocumentExtractorRegistry::new();
}
register_default_extractors().expect("Failed to register extractors");
let reg = registry.read();
let extractor_names = reg.list();
#[allow(unused_mut)]
let mut expected_count = 5; assert!(extractor_names.contains(&"plain-text-extractor".to_string()));
assert!(extractor_names.contains(&"markdown-extractor".to_string()));
assert!(extractor_names.contains(&"structured-extractor".to_string()));
assert!(extractor_names.contains(&"djot-extractor".to_string()));
assert!(extractor_names.contains(&"csv-extractor".to_string()));
#[cfg(any(feature = "ocr", feature = "ocr-wasm"))]
{
expected_count += 1;
assert!(extractor_names.contains(&"image-extractor".to_string()));
}
#[cfg(feature = "xml")]
{
expected_count += 3;
assert!(extractor_names.contains(&"xml-extractor".to_string()));
assert!(extractor_names.contains(&"jats-extractor".to_string()));
assert!(extractor_names.contains(&"docbook-extractor".to_string()));
}
#[cfg(feature = "pdf")]
{
expected_count += 1;
assert!(extractor_names.contains(&"pdf-extractor".to_string()));
}
#[cfg(any(feature = "excel", feature = "excel-wasm"))]
{
expected_count += 1;
assert!(extractor_names.contains(&"excel-extractor".to_string()));
}
#[cfg(feature = "office")]
{
expected_count += 17;
assert!(extractor_names.contains(&"bibtex-extractor".to_string()));
assert!(extractor_names.contains(&"citation-extractor".to_string()));
assert!(extractor_names.contains(&"epub-extractor".to_string()));
assert!(extractor_names.contains(&"fictionbook-extractor".to_string()));
assert!(extractor_names.contains(&"rtf-extractor".to_string()));
assert!(extractor_names.contains(&"rst-extractor".to_string()));
assert!(extractor_names.contains(&"latex-extractor".to_string()));
assert!(extractor_names.contains(&"jupyter-extractor".to_string()));
assert!(extractor_names.contains(&"orgmode-extractor".to_string()));
assert!(extractor_names.contains(&"opml-extractor".to_string()));
assert!(extractor_names.contains(&"typst-extractor".to_string()));
assert!(extractor_names.contains(&"dbf-extractor".to_string()));
assert!(extractor_names.contains(&"doc-extractor".to_string()));
assert!(extractor_names.contains(&"docx-extractor".to_string()));
assert!(extractor_names.contains(&"ppt-extractor".to_string()));
assert!(extractor_names.contains(&"pptx-extractor".to_string()));
assert!(extractor_names.contains(&"odt-extractor".to_string()));
}
#[cfg(feature = "hwp")]
{
expected_count += 1;
assert!(extractor_names.contains(&"hwp-extractor".to_string()));
}
#[cfg(feature = "iwork")]
{
expected_count += 3;
assert!(extractor_names.contains(&"iwork-pages-extractor".to_string()));
assert!(extractor_names.contains(&"iwork-numbers-extractor".to_string()));
assert!(extractor_names.contains(&"iwork-keynote-extractor".to_string()));
}
#[cfg(feature = "mdx")]
{
expected_count += 1;
assert!(extractor_names.contains(&"mdx-extractor".to_string()));
}
#[cfg(feature = "email")]
{
expected_count += 2;
assert!(extractor_names.contains(&"email-extractor".to_string()));
assert!(extractor_names.contains(&"pst-extractor".to_string()));
}
#[cfg(feature = "html")]
{
expected_count += 1;
assert!(extractor_names.contains(&"html-extractor".to_string()));
}
#[cfg(feature = "archives")]
{
expected_count += 4;
assert!(extractor_names.contains(&"zip-extractor".to_string()));
assert!(extractor_names.contains(&"tar-extractor".to_string()));
assert!(extractor_names.contains(&"7z-extractor".to_string()));
assert!(extractor_names.contains(&"gzip-extractor".to_string()));
}
assert_eq!(
extractor_names.len(),
expected_count,
"Expected {} extractors based on enabled features",
expected_count
);
}
#[test]
fn test_ensure_initialized() {
ensure_initialized().expect("Failed to ensure extractors initialized");
}
}