memvid_core/reader/
mod.rs

//! Document reader traits and registry for unified format ingestion.
2
3mod docx;
4mod passthrough;
5mod pdf;
6mod pptx;
7mod xls;
8mod xlsx;
9
10use serde_json::Value;
11
12pub use docx::DocxReader;
13pub use passthrough::PassthroughReader;
14pub use pdf::PdfReader;
15pub use pptx::PptxReader;
16pub use xls::XlsReader;
17pub use xlsx::XlsxReader;
18
19use crate::{ExtractedDocument, Result};
20
/// Coarse document-format tag used by the ingestion router.
///
/// The classification is a hint rather than a guarantee; `Unknown` covers
/// inputs whose format was not determined.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum DocumentFormat {
    /// PDF documents.
    Pdf,
    /// `.docx` word-processing documents.
    Docx,
    /// `.xlsx` spreadsheets.
    Xlsx,
    /// `.xls` spreadsheets.
    Xls,
    /// `.pptx` presentations.
    Pptx,
    /// Plain, unformatted text.
    PlainText,
    /// Markdown text.
    Markdown,
    /// HTML markup.
    Html,
    /// Format not determined.
    Unknown,
}
34
35impl DocumentFormat {
36    pub fn label(self) -> &'static str {
37        match self {
38            Self::Pdf => "pdf",
39            Self::Docx => "docx",
40            Self::Xlsx => "xlsx",
41            Self::Xls => "xls",
42            Self::Pptx => "pptx",
43            Self::PlainText => "text",
44            Self::Markdown => "markdown",
45            Self::Html => "html",
46            Self::Unknown => "unknown",
47        }
48    }
49}
50
51/// Hint provided to readers before probing/extraction.
52#[derive(Debug, Clone)]
53pub struct ReaderHint<'a> {
54    pub mime: Option<&'a str>,
55    pub format: Option<DocumentFormat>,
56    pub uri: Option<&'a str>,
57    pub magic_bytes: Option<&'a [u8]>,
58}
59
60impl<'a> ReaderHint<'a> {
61    #[must_use]
62    pub fn new(mime: Option<&'a str>, format: Option<DocumentFormat>) -> Self {
63        Self {
64            mime,
65            format,
66            uri: None,
67            magic_bytes: None,
68        }
69    }
70
71    #[must_use]
72    pub fn with_uri(mut self, uri: Option<&'a str>) -> Self {
73        self.uri = uri;
74        self
75    }
76
77    #[must_use]
78    pub fn with_magic(mut self, magic: Option<&'a [u8]>) -> Self {
79        self.magic_bytes = magic;
80        self
81    }
82}
83
84/// Structured text and metadata extracted from a document, plus routing diagnostics.
85#[derive(Debug, Clone)]
86pub struct ReaderOutput {
87    pub document: ExtractedDocument,
88    pub reader_name: String,
89    pub diagnostics: ReaderDiagnostics,
90}
91
92impl ReaderOutput {
93    #[must_use]
94    pub fn new(document: ExtractedDocument, reader_name: impl Into<String>) -> Self {
95        Self {
96            document,
97            reader_name: reader_name.into(),
98            diagnostics: ReaderDiagnostics::default(),
99        }
100    }
101
102    #[must_use]
103    pub fn with_diagnostics(mut self, diagnostics: ReaderDiagnostics) -> Self {
104        self.diagnostics = diagnostics;
105        self
106    }
107}
108
109/// Metadata about a reader attempt used for observability and surfacing warnings.
110#[derive(Debug, Clone, Default)]
111pub struct ReaderDiagnostics {
112    pub warnings: Vec<String>,
113    pub fallback: bool,
114    pub extra_metadata: Value,
115    pub duration_ms: Option<u64>,
116    pub pages_processed: Option<u32>,
117}
118
119impl ReaderDiagnostics {
120    pub fn record_warning<S: Into<String>>(&mut self, warning: S) {
121        self.warnings.push(warning.into());
122    }
123
124    pub fn mark_fallback(&mut self) {
125        self.fallback = true;
126    }
127
128    pub fn with_metadata(mut self, value: Value) -> Self {
129        self.extra_metadata = value;
130        self
131    }
132
133    pub fn merge_from(&mut self, other: &ReaderDiagnostics) {
134        self.warnings.extend(other.warnings.iter().cloned());
135        if other.fallback {
136            self.fallback = true;
137        }
138        if !other.extra_metadata.is_null() {
139            self.extra_metadata = other.extra_metadata.clone();
140        }
141        if other.duration_ms.is_some() {
142            self.duration_ms = other.duration_ms;
143        }
144        if other.pages_processed.is_some() {
145            self.pages_processed = other.pages_processed;
146        }
147    }
148
149    pub fn track_warning<S: Into<String>>(&mut self, warning: S) {
150        self.warnings.push(warning.into());
151        self.fallback = true;
152    }
153}
154
155/// Trait implemented by document readers that can extract text from supported formats.
156pub trait DocumentReader: Send + Sync {
157    /// Human-readable name used for diagnostics (e.g., "document_processor", "pdfium").
158    fn name(&self) -> &'static str;
159
160    /// Return true if this reader is a good match for the provided hint.
161    fn supports(&self, hint: &ReaderHint<'_>) -> bool;
162
163    /// Extract text and metadata from the provided bytes.
164    fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput>;
165}
166
167/// Registry of document readers used by the ingestion router.
168pub struct ReaderRegistry {
169    readers: Vec<Box<dyn DocumentReader>>,
170}
171
172impl ReaderRegistry {
173    #[must_use]
174    pub fn new() -> Self {
175        Self {
176            readers: Vec::new(),
177        }
178    }
179
180    pub fn register<R>(&mut self, reader: R)
181    where
182        R: DocumentReader + 'static,
183    {
184        self.readers.push(Box::new(reader));
185    }
186
187    #[must_use]
188    pub fn readers(&self) -> &[Box<dyn DocumentReader>] {
189        &self.readers
190    }
191
192    pub fn find_reader<'a>(&'a self, hint: &ReaderHint<'_>) -> Option<&'a dyn DocumentReader> {
193        self.readers
194            .iter()
195            .map(std::convert::AsRef::as_ref)
196            .find(|reader| reader.supports(hint))
197    }
198}
199
200impl Default for ReaderRegistry {
201    fn default() -> Self {
202        let mut registry = Self::new();
203        registry.register(PdfReader);
204        registry.register(DocxReader);
205        registry.register(XlsxReader);
206        registry.register(XlsReader);
207        registry.register(PptxReader);
208        registry.register(PassthroughReader);
209        registry
210    }
211}