memvid_core/reader/
mod.rs1mod docx;
4mod passthrough;
5mod pdf;
6mod pptx;
7mod xls;
8mod xlsx;
9
10use serde_json::Value;
11
12pub use docx::DocxReader;
13pub use passthrough::PassthroughReader;
14pub use pdf::PdfReader;
15pub use pptx::PptxReader;
16pub use xls::XlsReader;
17pub use xlsx::XlsxReader;
18
19use crate::{ExtractedDocument, Result};
20
/// Document format categories that can be attached to a [`ReaderHint`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DocumentFormat {
    Pdf,
    Docx,
    Xlsx,
    Xls,
    Pptx,
    PlainText,
    Markdown,
    Html,
    Jsonl,
    Unknown,
}

impl DocumentFormat {
    /// Returns the fixed lowercase label for this format.
    ///
    /// Most labels mirror the variant name (`Pdf` -> `"pdf"`); the one
    /// exception is `PlainText`, which is labelled `"text"`.
    #[must_use]
    pub fn label(self) -> &'static str {
        // Kept as an exhaustive match so that adding a variant forces a label
        // to be chosen at compile time.
        match self {
            // Office / binary document formats.
            DocumentFormat::Pdf => "pdf",
            DocumentFormat::Docx => "docx",
            DocumentFormat::Pptx => "pptx",
            DocumentFormat::Xlsx => "xlsx",
            DocumentFormat::Xls => "xls",
            // Text-based formats.
            DocumentFormat::PlainText => "text",
            DocumentFormat::Markdown => "markdown",
            DocumentFormat::Html => "html",
            DocumentFormat::Jsonl => "jsonl",
            // Anything else.
            DocumentFormat::Unknown => "unknown",
        }
    }
}
53
54#[derive(Debug, Clone)]
56pub struct ReaderHint<'a> {
57 pub mime: Option<&'a str>,
58 pub format: Option<DocumentFormat>,
59 pub uri: Option<&'a str>,
60 pub magic_bytes: Option<&'a [u8]>,
61}
62
63impl<'a> ReaderHint<'a> {
64 #[must_use]
65 pub fn new(mime: Option<&'a str>, format: Option<DocumentFormat>) -> Self {
66 Self {
67 mime,
68 format,
69 uri: None,
70 magic_bytes: None,
71 }
72 }
73
74 #[must_use]
75 pub fn with_uri(mut self, uri: Option<&'a str>) -> Self {
76 self.uri = uri;
77 self
78 }
79
80 #[must_use]
81 pub fn with_magic(mut self, magic: Option<&'a [u8]>) -> Self {
82 self.magic_bytes = magic;
83 self
84 }
85}
86
87#[derive(Debug, Clone)]
89pub struct ReaderOutput {
90 pub document: ExtractedDocument,
91 pub reader_name: String,
92 pub diagnostics: ReaderDiagnostics,
93}
94
95impl ReaderOutput {
96 #[must_use]
97 pub fn new(document: ExtractedDocument, reader_name: impl Into<String>) -> Self {
98 Self {
99 document,
100 reader_name: reader_name.into(),
101 diagnostics: ReaderDiagnostics::default(),
102 }
103 }
104
105 #[must_use]
106 pub fn with_diagnostics(mut self, diagnostics: ReaderDiagnostics) -> Self {
107 self.diagnostics = diagnostics;
108 self
109 }
110}
111
112#[derive(Debug, Clone, Default)]
114pub struct ReaderDiagnostics {
115 pub warnings: Vec<String>,
116 pub fallback: bool,
117 pub extra_metadata: Value,
118 pub duration_ms: Option<u64>,
119 pub pages_processed: Option<u32>,
120}
121
122impl ReaderDiagnostics {
123 pub fn record_warning<S: Into<String>>(&mut self, warning: S) {
124 self.warnings.push(warning.into());
125 }
126
127 pub fn mark_fallback(&mut self) {
128 self.fallback = true;
129 }
130
131 #[must_use]
132 pub fn with_metadata(mut self, value: Value) -> Self {
133 self.extra_metadata = value;
134 self
135 }
136
137 pub fn merge_from(&mut self, other: &ReaderDiagnostics) {
138 self.warnings.extend(other.warnings.iter().cloned());
139 if other.fallback {
140 self.fallback = true;
141 }
142 if !other.extra_metadata.is_null() {
143 self.extra_metadata = other.extra_metadata.clone();
144 }
145 if other.duration_ms.is_some() {
146 self.duration_ms = other.duration_ms;
147 }
148 if other.pages_processed.is_some() {
149 self.pages_processed = other.pages_processed;
150 }
151 }
152
153 pub fn track_warning<S: Into<String>>(&mut self, warning: S) {
154 self.warnings.push(warning.into());
155 self.fallback = true;
156 }
157}
158
159pub trait DocumentReader: Send + Sync {
161 fn name(&self) -> &'static str;
163
164 fn supports(&self, hint: &ReaderHint<'_>) -> bool;
166
167 fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput>;
169}
170
171pub struct ReaderRegistry {
173 readers: Vec<Box<dyn DocumentReader>>,
174}
175
176impl ReaderRegistry {
177 #[must_use]
178 pub fn new() -> Self {
179 Self {
180 readers: Vec::new(),
181 }
182 }
183
184 pub fn register<R>(&mut self, reader: R)
185 where
186 R: DocumentReader + 'static,
187 {
188 self.readers.push(Box::new(reader));
189 }
190
191 #[must_use]
192 pub fn readers(&self) -> &[Box<dyn DocumentReader>] {
193 &self.readers
194 }
195
196 pub fn find_reader<'a>(&'a self, hint: &ReaderHint<'_>) -> Option<&'a dyn DocumentReader> {
197 self.readers
198 .iter()
199 .map(std::convert::AsRef::as_ref)
200 .find(|reader| reader.supports(hint))
201 }
202}
203
204impl Default for ReaderRegistry {
205 fn default() -> Self {
206 let mut registry = Self::new();
207 registry.register(PdfReader);
208 registry.register(DocxReader);
209 registry.register(XlsxReader);
210 registry.register(XlsReader);
211 registry.register(PptxReader);
212 registry.register(PassthroughReader);
213 registry
214 }
215}