memvid_core/reader/
mod.rs

1//! Document reader traits and registry for unified format ingestion.
2
3mod docx;
4mod passthrough;
5mod pdf;
6mod pptx;
7mod xlsx;
8
9use serde_json::Value;
10
11pub use docx::DocxReader;
12pub use passthrough::PassthroughReader;
13pub use pdf::PdfReader;
14pub use pptx::PptxReader;
15pub use xlsx::XlsxReader;
16
17use crate::{ExtractedDocument, Result};
18
/// Coarse classification of a document's format, used by the ingestion router.
///
/// Each variant maps to a fixed lowercase string via [`DocumentFormat::label`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DocumentFormat {
    /// Labelled `"pdf"`.
    Pdf,
    /// Labelled `"docx"`.
    Docx,
    /// Labelled `"xlsx"`.
    Xlsx,
    /// Labelled `"pptx"`.
    Pptx,
    /// Labelled `"text"` (note: not `"plaintext"`).
    PlainText,
    /// Labelled `"markdown"`.
    Markdown,
    /// Labelled `"html"`.
    Html,
    /// Labelled `"unknown"`.
    Unknown,
}

impl DocumentFormat {
    /// Returns the static lowercase label for this format.
    ///
    /// The match is exhaustive on purpose: adding a variant without a label
    /// is a compile error rather than a silent fallback.
    pub fn label(self) -> &'static str {
        match self {
            // Text-like formats.
            Self::PlainText => "text",
            Self::Markdown => "markdown",
            Self::Html => "html",
            // Binary / container formats.
            Self::Pdf => "pdf",
            Self::Docx => "docx",
            Self::Xlsx => "xlsx",
            Self::Pptx => "pptx",
            // Unclassified input.
            Self::Unknown => "unknown",
        }
    }
}
46
47/// Hint provided to readers before probing/extraction.
48#[derive(Debug, Clone)]
49pub struct ReaderHint<'a> {
50    pub mime: Option<&'a str>,
51    pub format: Option<DocumentFormat>,
52    pub uri: Option<&'a str>,
53    pub magic_bytes: Option<&'a [u8]>,
54}
55
56impl<'a> ReaderHint<'a> {
57    #[must_use]
58    pub fn new(mime: Option<&'a str>, format: Option<DocumentFormat>) -> Self {
59        Self {
60            mime,
61            format,
62            uri: None,
63            magic_bytes: None,
64        }
65    }
66
67    #[must_use]
68    pub fn with_uri(mut self, uri: Option<&'a str>) -> Self {
69        self.uri = uri;
70        self
71    }
72
73    #[must_use]
74    pub fn with_magic(mut self, magic: Option<&'a [u8]>) -> Self {
75        self.magic_bytes = magic;
76        self
77    }
78}
79
80/// Structured text and metadata extracted from a document, plus routing diagnostics.
81#[derive(Debug, Clone)]
82pub struct ReaderOutput {
83    pub document: ExtractedDocument,
84    pub reader_name: String,
85    pub diagnostics: ReaderDiagnostics,
86}
87
88impl ReaderOutput {
89    #[must_use]
90    pub fn new(document: ExtractedDocument, reader_name: impl Into<String>) -> Self {
91        Self {
92            document,
93            reader_name: reader_name.into(),
94            diagnostics: ReaderDiagnostics::default(),
95        }
96    }
97
98    #[must_use]
99    pub fn with_diagnostics(mut self, diagnostics: ReaderDiagnostics) -> Self {
100        self.diagnostics = diagnostics;
101        self
102    }
103}
104
105/// Metadata about a reader attempt used for observability and surfacing warnings.
106#[derive(Debug, Clone, Default)]
107pub struct ReaderDiagnostics {
108    pub warnings: Vec<String>,
109    pub fallback: bool,
110    pub extra_metadata: Value,
111    pub duration_ms: Option<u64>,
112    pub pages_processed: Option<u32>,
113}
114
115impl ReaderDiagnostics {
116    pub fn record_warning<S: Into<String>>(&mut self, warning: S) {
117        self.warnings.push(warning.into());
118    }
119
120    pub fn mark_fallback(&mut self) {
121        self.fallback = true;
122    }
123
124    pub fn with_metadata(mut self, value: Value) -> Self {
125        self.extra_metadata = value;
126        self
127    }
128
129    pub fn merge_from(&mut self, other: &ReaderDiagnostics) {
130        self.warnings.extend(other.warnings.iter().cloned());
131        if other.fallback {
132            self.fallback = true;
133        }
134        if !other.extra_metadata.is_null() {
135            self.extra_metadata = other.extra_metadata.clone();
136        }
137        if other.duration_ms.is_some() {
138            self.duration_ms = other.duration_ms;
139        }
140        if other.pages_processed.is_some() {
141            self.pages_processed = other.pages_processed;
142        }
143    }
144
145    pub fn track_warning<S: Into<String>>(&mut self, warning: S) {
146        self.warnings.push(warning.into());
147        self.fallback = true;
148    }
149}
150
151/// Trait implemented by document readers that can extract text from supported formats.
152pub trait DocumentReader: Send + Sync {
153    /// Human-readable name used for diagnostics (e.g., "document_processor", "pdfium").
154    fn name(&self) -> &'static str;
155
156    /// Return true if this reader is a good match for the provided hint.
157    fn supports(&self, hint: &ReaderHint<'_>) -> bool;
158
159    /// Extract text and metadata from the provided bytes.
160    fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput>;
161}
162
163/// Registry of document readers used by the ingestion router.
164pub struct ReaderRegistry {
165    readers: Vec<Box<dyn DocumentReader>>,
166}
167
168impl ReaderRegistry {
169    #[must_use]
170    pub fn new() -> Self {
171        Self {
172            readers: Vec::new(),
173        }
174    }
175
176    pub fn register<R>(&mut self, reader: R)
177    where
178        R: DocumentReader + 'static,
179    {
180        self.readers.push(Box::new(reader));
181    }
182
183    #[must_use]
184    pub fn readers(&self) -> &[Box<dyn DocumentReader>] {
185        &self.readers
186    }
187
188    pub fn find_reader<'a>(&'a self, hint: &ReaderHint<'_>) -> Option<&'a dyn DocumentReader> {
189        self.readers
190            .iter()
191            .map(std::convert::AsRef::as_ref)
192            .find(|reader| reader.supports(hint))
193    }
194}
195
196impl Default for ReaderRegistry {
197    fn default() -> Self {
198        let mut registry = Self::new();
199        registry.register(PdfReader);
200        registry.register(DocxReader);
201        registry.register(XlsxReader);
202        registry.register(PptxReader);
203        registry.register(PassthroughReader);
204        registry
205    }
206}