memvid_core/reader/
mod.rs1mod docx;
4mod passthrough;
5mod pdf;
6mod pptx;
7mod xls;
8mod xlsx;
9
10use serde_json::Value;
11
12pub use docx::DocxReader;
13pub use passthrough::PassthroughReader;
14pub use pdf::PdfReader;
15pub use pptx::PptxReader;
16pub use xls::XlsReader;
17pub use xlsx::XlsxReader;
18
19use crate::{ExtractedDocument, Result};
20
/// Document formats distinguished by the reader layer.
///
/// Every variant maps to a fixed lowercase name through
/// [`DocumentFormat::label`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DocumentFormat {
    /// Labelled `"pdf"`.
    Pdf,
    /// Labelled `"docx"`.
    Docx,
    /// Labelled `"xlsx"`.
    Xlsx,
    /// Labelled `"xls"`.
    Xls,
    /// Labelled `"pptx"`.
    Pptx,
    /// Labelled `"text"` (note: not `"plaintext"`).
    PlainText,
    /// Labelled `"markdown"`.
    Markdown,
    /// Labelled `"html"`.
    Html,
    /// Labelled `"unknown"`.
    Unknown,
}

impl DocumentFormat {
    /// Returns the lowercase, `'static` name of this format.
    ///
    /// The match is exhaustive on purpose: adding a variant without giving it
    /// a label is a compile error rather than a silent fallback.
    ///
    /// Being a `const fn`, the label can also be used in constant contexts.
    #[must_use]
    pub const fn label(self) -> &'static str {
        match self {
            Self::Pdf => "pdf",
            Self::Docx => "docx",
            Self::Xlsx => "xlsx",
            Self::Xls => "xls",
            Self::Pptx => "pptx",
            Self::PlainText => "text",
            Self::Markdown => "markdown",
            Self::Html => "html",
            Self::Unknown => "unknown",
        }
    }
}
50
51#[derive(Debug, Clone)]
53pub struct ReaderHint<'a> {
54 pub mime: Option<&'a str>,
55 pub format: Option<DocumentFormat>,
56 pub uri: Option<&'a str>,
57 pub magic_bytes: Option<&'a [u8]>,
58}
59
60impl<'a> ReaderHint<'a> {
61 #[must_use]
62 pub fn new(mime: Option<&'a str>, format: Option<DocumentFormat>) -> Self {
63 Self {
64 mime,
65 format,
66 uri: None,
67 magic_bytes: None,
68 }
69 }
70
71 #[must_use]
72 pub fn with_uri(mut self, uri: Option<&'a str>) -> Self {
73 self.uri = uri;
74 self
75 }
76
77 #[must_use]
78 pub fn with_magic(mut self, magic: Option<&'a [u8]>) -> Self {
79 self.magic_bytes = magic;
80 self
81 }
82}
83
84#[derive(Debug, Clone)]
86pub struct ReaderOutput {
87 pub document: ExtractedDocument,
88 pub reader_name: String,
89 pub diagnostics: ReaderDiagnostics,
90}
91
92impl ReaderOutput {
93 #[must_use]
94 pub fn new(document: ExtractedDocument, reader_name: impl Into<String>) -> Self {
95 Self {
96 document,
97 reader_name: reader_name.into(),
98 diagnostics: ReaderDiagnostics::default(),
99 }
100 }
101
102 #[must_use]
103 pub fn with_diagnostics(mut self, diagnostics: ReaderDiagnostics) -> Self {
104 self.diagnostics = diagnostics;
105 self
106 }
107}
108
109#[derive(Debug, Clone, Default)]
111pub struct ReaderDiagnostics {
112 pub warnings: Vec<String>,
113 pub fallback: bool,
114 pub extra_metadata: Value,
115 pub duration_ms: Option<u64>,
116 pub pages_processed: Option<u32>,
117}
118
119impl ReaderDiagnostics {
120 pub fn record_warning<S: Into<String>>(&mut self, warning: S) {
121 self.warnings.push(warning.into());
122 }
123
124 pub fn mark_fallback(&mut self) {
125 self.fallback = true;
126 }
127
128 pub fn with_metadata(mut self, value: Value) -> Self {
129 self.extra_metadata = value;
130 self
131 }
132
133 pub fn merge_from(&mut self, other: &ReaderDiagnostics) {
134 self.warnings.extend(other.warnings.iter().cloned());
135 if other.fallback {
136 self.fallback = true;
137 }
138 if !other.extra_metadata.is_null() {
139 self.extra_metadata = other.extra_metadata.clone();
140 }
141 if other.duration_ms.is_some() {
142 self.duration_ms = other.duration_ms;
143 }
144 if other.pages_processed.is_some() {
145 self.pages_processed = other.pages_processed;
146 }
147 }
148
149 pub fn track_warning<S: Into<String>>(&mut self, warning: S) {
150 self.warnings.push(warning.into());
151 self.fallback = true;
152 }
153}
154
155pub trait DocumentReader: Send + Sync {
157 fn name(&self) -> &'static str;
159
160 fn supports(&self, hint: &ReaderHint<'_>) -> bool;
162
163 fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput>;
165}
166
167pub struct ReaderRegistry {
169 readers: Vec<Box<dyn DocumentReader>>,
170}
171
172impl ReaderRegistry {
173 #[must_use]
174 pub fn new() -> Self {
175 Self {
176 readers: Vec::new(),
177 }
178 }
179
180 pub fn register<R>(&mut self, reader: R)
181 where
182 R: DocumentReader + 'static,
183 {
184 self.readers.push(Box::new(reader));
185 }
186
187 #[must_use]
188 pub fn readers(&self) -> &[Box<dyn DocumentReader>] {
189 &self.readers
190 }
191
192 pub fn find_reader<'a>(&'a self, hint: &ReaderHint<'_>) -> Option<&'a dyn DocumentReader> {
193 self.readers
194 .iter()
195 .map(std::convert::AsRef::as_ref)
196 .find(|reader| reader.supports(hint))
197 }
198}
199
200impl Default for ReaderRegistry {
201 fn default() -> Self {
202 let mut registry = Self::new();
203 registry.register(PdfReader);
204 registry.register(DocxReader);
205 registry.register(XlsxReader);
206 registry.register(XlsReader);
207 registry.register(PptxReader);
208 registry.register(PassthroughReader);
209 registry
210 }
211}