Skip to main content

memvid_core/reader/
mod.rs

1//! Document reader traits and registry for unified format ingestion.
2
3mod docx;
4mod passthrough;
5mod pdf;
6mod pptx;
7mod xls;
8mod xlsx;
9
10use serde_json::Value;
11
12pub use docx::DocxReader;
13pub use passthrough::PassthroughReader;
14pub use pdf::PdfReader;
15pub use pptx::PptxReader;
16pub use xls::XlsReader;
17pub use xlsx::XlsxReader;
18
19use crate::{ExtractedDocument, Result};
20
/// Soft classification of document formats used by the ingestion router.
///
/// The type is `Copy`, so it is passed by value throughout this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DocumentFormat {
    Pdf,
    Docx,
    Xlsx,
    Xls,
    Pptx,
    PlainText,
    Markdown,
    Html,
    Unknown,
}

impl DocumentFormat {
    /// Returns the short, lowercase label for this format.
    ///
    /// Each variant maps to its lowercased name, with one exception:
    /// [`DocumentFormat::PlainText`] is labelled `"text"`.
    ///
    /// This is a `const fn`, so labels can also be used to build `const` or
    /// `static` tables at compile time.
    #[must_use]
    pub const fn label(self) -> &'static str {
        match self {
            Self::Pdf => "pdf",
            Self::Docx => "docx",
            Self::Xlsx => "xlsx",
            Self::Xls => "xls",
            Self::Pptx => "pptx",
            Self::PlainText => "text",
            Self::Markdown => "markdown",
            Self::Html => "html",
            Self::Unknown => "unknown",
        }
    }
}
51
52/// Hint provided to readers before probing/extraction.
53#[derive(Debug, Clone)]
54pub struct ReaderHint<'a> {
55    pub mime: Option<&'a str>,
56    pub format: Option<DocumentFormat>,
57    pub uri: Option<&'a str>,
58    pub magic_bytes: Option<&'a [u8]>,
59}
60
61impl<'a> ReaderHint<'a> {
62    #[must_use]
63    pub fn new(mime: Option<&'a str>, format: Option<DocumentFormat>) -> Self {
64        Self {
65            mime,
66            format,
67            uri: None,
68            magic_bytes: None,
69        }
70    }
71
72    #[must_use]
73    pub fn with_uri(mut self, uri: Option<&'a str>) -> Self {
74        self.uri = uri;
75        self
76    }
77
78    #[must_use]
79    pub fn with_magic(mut self, magic: Option<&'a [u8]>) -> Self {
80        self.magic_bytes = magic;
81        self
82    }
83}
84
85/// Structured text and metadata extracted from a document, plus routing diagnostics.
86#[derive(Debug, Clone)]
87pub struct ReaderOutput {
88    pub document: ExtractedDocument,
89    pub reader_name: String,
90    pub diagnostics: ReaderDiagnostics,
91}
92
93impl ReaderOutput {
94    #[must_use]
95    pub fn new(document: ExtractedDocument, reader_name: impl Into<String>) -> Self {
96        Self {
97            document,
98            reader_name: reader_name.into(),
99            diagnostics: ReaderDiagnostics::default(),
100        }
101    }
102
103    #[must_use]
104    pub fn with_diagnostics(mut self, diagnostics: ReaderDiagnostics) -> Self {
105        self.diagnostics = diagnostics;
106        self
107    }
108}
109
110/// Metadata about a reader attempt used for observability and surfacing warnings.
111#[derive(Debug, Clone, Default)]
112pub struct ReaderDiagnostics {
113    pub warnings: Vec<String>,
114    pub fallback: bool,
115    pub extra_metadata: Value,
116    pub duration_ms: Option<u64>,
117    pub pages_processed: Option<u32>,
118}
119
120impl ReaderDiagnostics {
121    pub fn record_warning<S: Into<String>>(&mut self, warning: S) {
122        self.warnings.push(warning.into());
123    }
124
125    pub fn mark_fallback(&mut self) {
126        self.fallback = true;
127    }
128
129    #[must_use]
130    pub fn with_metadata(mut self, value: Value) -> Self {
131        self.extra_metadata = value;
132        self
133    }
134
135    pub fn merge_from(&mut self, other: &ReaderDiagnostics) {
136        self.warnings.extend(other.warnings.iter().cloned());
137        if other.fallback {
138            self.fallback = true;
139        }
140        if !other.extra_metadata.is_null() {
141            self.extra_metadata = other.extra_metadata.clone();
142        }
143        if other.duration_ms.is_some() {
144            self.duration_ms = other.duration_ms;
145        }
146        if other.pages_processed.is_some() {
147            self.pages_processed = other.pages_processed;
148        }
149    }
150
151    pub fn track_warning<S: Into<String>>(&mut self, warning: S) {
152        self.warnings.push(warning.into());
153        self.fallback = true;
154    }
155}
156
157/// Trait implemented by document readers that can extract text from supported formats.
158pub trait DocumentReader: Send + Sync {
159    /// Human-readable name used for diagnostics (e.g., "`document_processor`", "pdfium").
160    fn name(&self) -> &'static str;
161
162    /// Return true if this reader is a good match for the provided hint.
163    fn supports(&self, hint: &ReaderHint<'_>) -> bool;
164
165    /// Extract text and metadata from the provided bytes.
166    fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput>;
167}
168
169/// Registry of document readers used by the ingestion router.
170pub struct ReaderRegistry {
171    readers: Vec<Box<dyn DocumentReader>>,
172}
173
174impl ReaderRegistry {
175    #[must_use]
176    pub fn new() -> Self {
177        Self {
178            readers: Vec::new(),
179        }
180    }
181
182    pub fn register<R>(&mut self, reader: R)
183    where
184        R: DocumentReader + 'static,
185    {
186        self.readers.push(Box::new(reader));
187    }
188
189    #[must_use]
190    pub fn readers(&self) -> &[Box<dyn DocumentReader>] {
191        &self.readers
192    }
193
194    pub fn find_reader<'a>(&'a self, hint: &ReaderHint<'_>) -> Option<&'a dyn DocumentReader> {
195        self.readers
196            .iter()
197            .map(std::convert::AsRef::as_ref)
198            .find(|reader| reader.supports(hint))
199    }
200}
201
202impl Default for ReaderRegistry {
203    fn default() -> Self {
204        let mut registry = Self::new();
205        registry.register(PdfReader);
206        registry.register(DocxReader);
207        registry.register(XlsxReader);
208        registry.register(XlsReader);
209        registry.register(PptxReader);
210        registry.register(PassthroughReader);
211        registry
212    }
213}