memvid_core/reader/
mod.rs1mod docx;
4mod passthrough;
5mod pdf;
6mod pptx;
7mod xls;
8mod xlsx;
9
10use serde_json::Value;
11
12pub use docx::DocxReader;
13pub use passthrough::PassthroughReader;
14pub use pdf::PdfReader;
15pub use pptx::PptxReader;
16pub use xls::XlsReader;
17pub use xlsx::XlsxReader;
18
19use crate::{ExtractedDocument, Result};
20
/// Document formats that the reader layer can name.
///
/// Each variant maps to a short lowercase label via [`DocumentFormat::label`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DocumentFormat {
    /// Labelled `"pdf"`.
    Pdf,
    /// Labelled `"docx"`.
    Docx,
    /// Labelled `"xlsx"`.
    Xlsx,
    /// Labelled `"xls"`.
    Xls,
    /// Labelled `"pptx"`.
    Pptx,
    /// Labelled `"text"`.
    PlainText,
    /// Labelled `"markdown"`.
    Markdown,
    /// Labelled `"html"`.
    Html,
    /// Labelled `"unknown"`.
    Unknown,
}

impl DocumentFormat {
    /// Returns the static, lowercase label associated with this format.
    #[must_use]
    pub fn label(self) -> &'static str {
        match self {
            // Text-like formats.
            DocumentFormat::PlainText => "text",
            DocumentFormat::Markdown => "markdown",
            DocumentFormat::Html => "html",
            // Formats whose label equals the usual file extension.
            DocumentFormat::Pdf => "pdf",
            DocumentFormat::Docx => "docx",
            DocumentFormat::Pptx => "pptx",
            DocumentFormat::Xlsx => "xlsx",
            DocumentFormat::Xls => "xls",
            // Catch-all variant; the match stays exhaustive on purpose.
            DocumentFormat::Unknown => "unknown",
        }
    }
}
51
52#[derive(Debug, Clone)]
54pub struct ReaderHint<'a> {
55 pub mime: Option<&'a str>,
56 pub format: Option<DocumentFormat>,
57 pub uri: Option<&'a str>,
58 pub magic_bytes: Option<&'a [u8]>,
59}
60
61impl<'a> ReaderHint<'a> {
62 #[must_use]
63 pub fn new(mime: Option<&'a str>, format: Option<DocumentFormat>) -> Self {
64 Self {
65 mime,
66 format,
67 uri: None,
68 magic_bytes: None,
69 }
70 }
71
72 #[must_use]
73 pub fn with_uri(mut self, uri: Option<&'a str>) -> Self {
74 self.uri = uri;
75 self
76 }
77
78 #[must_use]
79 pub fn with_magic(mut self, magic: Option<&'a [u8]>) -> Self {
80 self.magic_bytes = magic;
81 self
82 }
83}
84
85#[derive(Debug, Clone)]
87pub struct ReaderOutput {
88 pub document: ExtractedDocument,
89 pub reader_name: String,
90 pub diagnostics: ReaderDiagnostics,
91}
92
93impl ReaderOutput {
94 #[must_use]
95 pub fn new(document: ExtractedDocument, reader_name: impl Into<String>) -> Self {
96 Self {
97 document,
98 reader_name: reader_name.into(),
99 diagnostics: ReaderDiagnostics::default(),
100 }
101 }
102
103 #[must_use]
104 pub fn with_diagnostics(mut self, diagnostics: ReaderDiagnostics) -> Self {
105 self.diagnostics = diagnostics;
106 self
107 }
108}
109
110#[derive(Debug, Clone, Default)]
112pub struct ReaderDiagnostics {
113 pub warnings: Vec<String>,
114 pub fallback: bool,
115 pub extra_metadata: Value,
116 pub duration_ms: Option<u64>,
117 pub pages_processed: Option<u32>,
118}
119
120impl ReaderDiagnostics {
121 pub fn record_warning<S: Into<String>>(&mut self, warning: S) {
122 self.warnings.push(warning.into());
123 }
124
125 pub fn mark_fallback(&mut self) {
126 self.fallback = true;
127 }
128
129 #[must_use]
130 pub fn with_metadata(mut self, value: Value) -> Self {
131 self.extra_metadata = value;
132 self
133 }
134
135 pub fn merge_from(&mut self, other: &ReaderDiagnostics) {
136 self.warnings.extend(other.warnings.iter().cloned());
137 if other.fallback {
138 self.fallback = true;
139 }
140 if !other.extra_metadata.is_null() {
141 self.extra_metadata = other.extra_metadata.clone();
142 }
143 if other.duration_ms.is_some() {
144 self.duration_ms = other.duration_ms;
145 }
146 if other.pages_processed.is_some() {
147 self.pages_processed = other.pages_processed;
148 }
149 }
150
151 pub fn track_warning<S: Into<String>>(&mut self, warning: S) {
152 self.warnings.push(warning.into());
153 self.fallback = true;
154 }
155}
156
157pub trait DocumentReader: Send + Sync {
159 fn name(&self) -> &'static str;
161
162 fn supports(&self, hint: &ReaderHint<'_>) -> bool;
164
165 fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput>;
167}
168
169pub struct ReaderRegistry {
171 readers: Vec<Box<dyn DocumentReader>>,
172}
173
174impl ReaderRegistry {
175 #[must_use]
176 pub fn new() -> Self {
177 Self {
178 readers: Vec::new(),
179 }
180 }
181
182 pub fn register<R>(&mut self, reader: R)
183 where
184 R: DocumentReader + 'static,
185 {
186 self.readers.push(Box::new(reader));
187 }
188
189 #[must_use]
190 pub fn readers(&self) -> &[Box<dyn DocumentReader>] {
191 &self.readers
192 }
193
194 pub fn find_reader<'a>(&'a self, hint: &ReaderHint<'_>) -> Option<&'a dyn DocumentReader> {
195 self.readers
196 .iter()
197 .map(std::convert::AsRef::as_ref)
198 .find(|reader| reader.supports(hint))
199 }
200}
201
202impl Default for ReaderRegistry {
203 fn default() -> Self {
204 let mut registry = Self::new();
205 registry.register(PdfReader);
206 registry.register(DocxReader);
207 registry.register(XlsxReader);
208 registry.register(XlsReader);
209 registry.register(PptxReader);
210 registry.register(PassthroughReader);
211 registry
212 }
213}