Skip to main content

memvid_core/reader/
mod.rs

1//! Document reader traits and registry for unified format ingestion.
2
3mod docx;
4mod passthrough;
5mod pdf;
6mod pptx;
7mod xls;
8mod xlsx;
9
10use serde_json::Value;
11
12pub use docx::DocxReader;
13pub use passthrough::PassthroughReader;
14pub use pdf::PdfReader;
15pub use pptx::PptxReader;
16pub use xls::XlsReader;
17pub use xlsx::XlsxReader;
18
19use crate::{ExtractedDocument, Result};
20
/// Coarse format tag the ingestion router attaches to a document.
///
/// The tag is a cheap `Copy` value that can be compared and hashed. `Unknown` is the
/// variant available when none of the named formats applies.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DocumentFormat {
    Pdf,
    Docx,
    Xlsx,
    Xls,
    Pptx,
    PlainText,
    Markdown,
    Html,
    Jsonl,
    Unknown,
}

impl DocumentFormat {
    /// Returns the fixed lowercase identifier for this format.
    ///
    /// Note that `PlainText` maps to `"text"`; every other variant maps to its own
    /// name in lowercase. The match is deliberately exhaustive (no `_` arm) so that
    /// adding a variant forces a label to be chosen.
    #[must_use]
    pub fn label(self) -> &'static str {
        let name = match self {
            // Office / binary document containers.
            DocumentFormat::Pdf => "pdf",
            DocumentFormat::Docx => "docx",
            DocumentFormat::Xlsx => "xlsx",
            DocumentFormat::Xls => "xls",
            DocumentFormat::Pptx => "pptx",
            // Text-based formats.
            DocumentFormat::PlainText => "text",
            DocumentFormat::Markdown => "markdown",
            DocumentFormat::Html => "html",
            DocumentFormat::Jsonl => "jsonl",
            // Nothing matched.
            DocumentFormat::Unknown => "unknown",
        };
        name
    }
}
53
54/// Hint provided to readers before probing/extraction.
55#[derive(Debug, Clone)]
56pub struct ReaderHint<'a> {
57    pub mime: Option<&'a str>,
58    pub format: Option<DocumentFormat>,
59    pub uri: Option<&'a str>,
60    pub magic_bytes: Option<&'a [u8]>,
61}
62
63impl<'a> ReaderHint<'a> {
64    #[must_use]
65    pub fn new(mime: Option<&'a str>, format: Option<DocumentFormat>) -> Self {
66        Self {
67            mime,
68            format,
69            uri: None,
70            magic_bytes: None,
71        }
72    }
73
74    #[must_use]
75    pub fn with_uri(mut self, uri: Option<&'a str>) -> Self {
76        self.uri = uri;
77        self
78    }
79
80    #[must_use]
81    pub fn with_magic(mut self, magic: Option<&'a [u8]>) -> Self {
82        self.magic_bytes = magic;
83        self
84    }
85}
86
87/// Structured text and metadata extracted from a document, plus routing diagnostics.
88#[derive(Debug, Clone)]
89pub struct ReaderOutput {
90    pub document: ExtractedDocument,
91    pub reader_name: String,
92    pub diagnostics: ReaderDiagnostics,
93}
94
95impl ReaderOutput {
96    #[must_use]
97    pub fn new(document: ExtractedDocument, reader_name: impl Into<String>) -> Self {
98        Self {
99            document,
100            reader_name: reader_name.into(),
101            diagnostics: ReaderDiagnostics::default(),
102        }
103    }
104
105    #[must_use]
106    pub fn with_diagnostics(mut self, diagnostics: ReaderDiagnostics) -> Self {
107        self.diagnostics = diagnostics;
108        self
109    }
110}
111
112/// Metadata about a reader attempt used for observability and surfacing warnings.
113#[derive(Debug, Clone, Default)]
114pub struct ReaderDiagnostics {
115    pub warnings: Vec<String>,
116    pub fallback: bool,
117    pub extra_metadata: Value,
118    pub duration_ms: Option<u64>,
119    pub pages_processed: Option<u32>,
120}
121
122impl ReaderDiagnostics {
123    pub fn record_warning<S: Into<String>>(&mut self, warning: S) {
124        self.warnings.push(warning.into());
125    }
126
127    pub fn mark_fallback(&mut self) {
128        self.fallback = true;
129    }
130
131    #[must_use]
132    pub fn with_metadata(mut self, value: Value) -> Self {
133        self.extra_metadata = value;
134        self
135    }
136
137    pub fn merge_from(&mut self, other: &ReaderDiagnostics) {
138        self.warnings.extend(other.warnings.iter().cloned());
139        if other.fallback {
140            self.fallback = true;
141        }
142        if !other.extra_metadata.is_null() {
143            self.extra_metadata = other.extra_metadata.clone();
144        }
145        if other.duration_ms.is_some() {
146            self.duration_ms = other.duration_ms;
147        }
148        if other.pages_processed.is_some() {
149            self.pages_processed = other.pages_processed;
150        }
151    }
152
153    pub fn track_warning<S: Into<String>>(&mut self, warning: S) {
154        self.warnings.push(warning.into());
155        self.fallback = true;
156    }
157}
158
/// Trait implemented by document readers that can extract text from supported formats.
///
/// The `Send + Sync` supertraits are required of every implementor. The registry in
/// this module stores readers as `Box<dyn DocumentReader>` and selects one by calling
/// [`DocumentReader::supports`] on each in registration order.
pub trait DocumentReader: Send + Sync {
    /// Human-readable name used for diagnostics (e.g., "`document_processor`", "pdfium").
    ///
    /// The returned string has `'static` lifetime, so it does not borrow from the reader.
    fn name(&self) -> &'static str;

    /// Return true if this reader is a good match for the provided hint.
    ///
    /// Only the hint is available here; the document bytes are not passed in.
    fn supports(&self, hint: &ReaderHint<'_>) -> bool;

    /// Extract text and metadata from the provided bytes.
    ///
    /// `hint` is the same kind of value passed to [`DocumentReader::supports`].
    ///
    /// # Errors
    ///
    /// Returns the crate-level error type (via `crate::Result`); the concrete failure
    /// conditions are defined by each implementor.
    fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput>;
}
170
171/// Registry of document readers used by the ingestion router.
172pub struct ReaderRegistry {
173    readers: Vec<Box<dyn DocumentReader>>,
174}
175
176impl ReaderRegistry {
177    #[must_use]
178    pub fn new() -> Self {
179        Self {
180            readers: Vec::new(),
181        }
182    }
183
184    pub fn register<R>(&mut self, reader: R)
185    where
186        R: DocumentReader + 'static,
187    {
188        self.readers.push(Box::new(reader));
189    }
190
191    #[must_use]
192    pub fn readers(&self) -> &[Box<dyn DocumentReader>] {
193        &self.readers
194    }
195
196    pub fn find_reader<'a>(&'a self, hint: &ReaderHint<'_>) -> Option<&'a dyn DocumentReader> {
197        self.readers
198            .iter()
199            .map(std::convert::AsRef::as_ref)
200            .find(|reader| reader.supports(hint))
201    }
202}
203
204impl Default for ReaderRegistry {
205    fn default() -> Self {
206        let mut registry = Self::new();
207        registry.register(PdfReader);
208        registry.register(DocxReader);
209        registry.register(XlsxReader);
210        registry.register(XlsReader);
211        registry.register(PptxReader);
212        registry.register(PassthroughReader);
213        registry
214    }
215}