Skip to main content

memvid_core/reader/
mod.rs

1//! Document reader traits and registry for unified format ingestion.
2
3mod docx;
4mod passthrough;
5mod pdf;
6mod pptx;
7mod xls;
8mod xlsx;
9pub(crate) mod xlsx_chunker;
10pub(crate) mod xlsx_ooxml;
11pub(crate) mod xlsx_table_detect;
12
13use serde_json::Value;
14
15pub use docx::DocxReader;
16pub use passthrough::PassthroughReader;
17pub use pdf::PdfReader;
18pub use pptx::PptxReader;
19pub use xls::XlsReader;
20pub use xlsx::{XlsxReader, XlsxStructuredDiagnostics, XlsxStructuredResult};
21pub use xlsx_chunker::XlsxChunkingOptions;
22pub use xlsx_table_detect::DetectedTable;
23
24use crate::{ExtractedDocument, Result};
25
/// Soft document-format classification consumed by the ingestion router.
///
/// Each variant has a short static identifier, available through
/// [`DocumentFormat::label`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DocumentFormat {
    /// Labelled `"pdf"`.
    Pdf,
    /// Labelled `"docx"`.
    Docx,
    /// Labelled `"xlsx"`.
    Xlsx,
    /// Labelled `"xls"`.
    Xls,
    /// Labelled `"pptx"`.
    Pptx,
    /// Labelled `"text"` (note: not `"plaintext"`).
    PlainText,
    /// Labelled `"markdown"`.
    Markdown,
    /// Labelled `"html"`.
    Html,
    /// Labelled `"jsonl"`.
    Jsonl,
    /// Labelled `"unknown"`.
    Unknown,
}

impl DocumentFormat {
    /// Returns the lowercase static identifier for this format.
    ///
    /// The match is exhaustive on purpose: adding a variant without giving it
    /// a label is a compile error.
    #[must_use]
    pub fn label(self) -> &'static str {
        match self {
            // Text-like formats.
            Self::PlainText => "text",
            Self::Markdown => "markdown",
            Self::Html => "html",
            Self::Jsonl => "jsonl",
            // Binary / office formats.
            Self::Pdf => "pdf",
            Self::Docx => "docx",
            Self::Pptx => "pptx",
            Self::Xlsx => "xlsx",
            Self::Xls => "xls",
            // Fallback classification.
            Self::Unknown => "unknown",
        }
    }
}
58
59/// Hint provided to readers before probing/extraction.
60#[derive(Debug, Clone)]
61pub struct ReaderHint<'a> {
62    pub mime: Option<&'a str>,
63    pub format: Option<DocumentFormat>,
64    pub uri: Option<&'a str>,
65    pub magic_bytes: Option<&'a [u8]>,
66}
67
68impl<'a> ReaderHint<'a> {
69    #[must_use]
70    pub fn new(mime: Option<&'a str>, format: Option<DocumentFormat>) -> Self {
71        Self {
72            mime,
73            format,
74            uri: None,
75            magic_bytes: None,
76        }
77    }
78
79    #[must_use]
80    pub fn with_uri(mut self, uri: Option<&'a str>) -> Self {
81        self.uri = uri;
82        self
83    }
84
85    #[must_use]
86    pub fn with_magic(mut self, magic: Option<&'a [u8]>) -> Self {
87        self.magic_bytes = magic;
88        self
89    }
90}
91
92/// Structured text and metadata extracted from a document, plus routing diagnostics.
93#[derive(Debug, Clone)]
94pub struct ReaderOutput {
95    pub document: ExtractedDocument,
96    pub reader_name: String,
97    pub diagnostics: ReaderDiagnostics,
98}
99
100impl ReaderOutput {
101    #[must_use]
102    pub fn new(document: ExtractedDocument, reader_name: impl Into<String>) -> Self {
103        Self {
104            document,
105            reader_name: reader_name.into(),
106            diagnostics: ReaderDiagnostics::default(),
107        }
108    }
109
110    #[must_use]
111    pub fn with_diagnostics(mut self, diagnostics: ReaderDiagnostics) -> Self {
112        self.diagnostics = diagnostics;
113        self
114    }
115}
116
117/// Metadata about a reader attempt used for observability and surfacing warnings.
118#[derive(Debug, Clone, Default)]
119pub struct ReaderDiagnostics {
120    pub warnings: Vec<String>,
121    pub fallback: bool,
122    pub extra_metadata: Value,
123    pub duration_ms: Option<u64>,
124    pub pages_processed: Option<u32>,
125}
126
127impl ReaderDiagnostics {
128    pub fn record_warning<S: Into<String>>(&mut self, warning: S) {
129        self.warnings.push(warning.into());
130    }
131
132    pub fn mark_fallback(&mut self) {
133        self.fallback = true;
134    }
135
136    #[must_use]
137    pub fn with_metadata(mut self, value: Value) -> Self {
138        self.extra_metadata = value;
139        self
140    }
141
142    pub fn merge_from(&mut self, other: &ReaderDiagnostics) {
143        self.warnings.extend(other.warnings.iter().cloned());
144        if other.fallback {
145            self.fallback = true;
146        }
147        if !other.extra_metadata.is_null() {
148            self.extra_metadata = other.extra_metadata.clone();
149        }
150        if other.duration_ms.is_some() {
151            self.duration_ms = other.duration_ms;
152        }
153        if other.pages_processed.is_some() {
154            self.pages_processed = other.pages_processed;
155        }
156    }
157
158    pub fn track_warning<S: Into<String>>(&mut self, warning: S) {
159        self.warnings.push(warning.into());
160        self.fallback = true;
161    }
162}
163
164/// Trait implemented by document readers that can extract text from supported formats.
165pub trait DocumentReader: Send + Sync {
166    /// Human-readable name used for diagnostics (e.g., "`document_processor`", "pdfium").
167    fn name(&self) -> &'static str;
168
169    /// Return true if this reader is a good match for the provided hint.
170    fn supports(&self, hint: &ReaderHint<'_>) -> bool;
171
172    /// Extract text and metadata from the provided bytes.
173    fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput>;
174}
175
176/// Registry of document readers used by the ingestion router.
177pub struct ReaderRegistry {
178    readers: Vec<Box<dyn DocumentReader>>,
179}
180
181impl ReaderRegistry {
182    #[must_use]
183    pub fn new() -> Self {
184        Self {
185            readers: Vec::new(),
186        }
187    }
188
189    pub fn register<R>(&mut self, reader: R)
190    where
191        R: DocumentReader + 'static,
192    {
193        self.readers.push(Box::new(reader));
194    }
195
196    #[must_use]
197    pub fn readers(&self) -> &[Box<dyn DocumentReader>] {
198        &self.readers
199    }
200
201    pub fn find_reader<'a>(&'a self, hint: &ReaderHint<'_>) -> Option<&'a dyn DocumentReader> {
202        self.readers
203            .iter()
204            .map(std::convert::AsRef::as_ref)
205            .find(|reader| reader.supports(hint))
206    }
207}
208
209impl Default for ReaderRegistry {
210    fn default() -> Self {
211        let mut registry = Self::new();
212        registry.register(PdfReader);
213        registry.register(DocxReader);
214        registry.register(XlsxReader);
215        registry.register(XlsReader);
216        registry.register(PptxReader);
217        registry.register(PassthroughReader);
218        registry
219    }
220}