memvid_core/reader/
mod.rs1mod docx;
4mod passthrough;
5mod pdf;
6mod pptx;
7mod xlsx;
8
9use serde_json::Value;
10
11pub use docx::DocxReader;
12pub use passthrough::PassthroughReader;
13pub use pdf::PdfReader;
14pub use pptx::PptxReader;
15pub use xlsx::XlsxReader;
16
17use crate::{ExtractedDocument, Result};
18
/// Document format tag; carried by `ReaderHint::format` to describe an input.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DocumentFormat {
    Pdf,
    Docx,
    Xlsx,
    Pptx,
    PlainText,
    Markdown,
    Html,
    Unknown,
}

impl DocumentFormat {
    /// Returns the short, lowercase, static label for this format.
    ///
    /// Most labels mirror the variant name (`"pdf"`, `"docx"`, ...); note that
    /// [`DocumentFormat::PlainText`] maps to `"text"`.
    ///
    /// The function is `const`, so the label can also be computed in constant
    /// and static initialisers.
    #[must_use]
    pub const fn label(self) -> &'static str {
        match self {
            Self::Pdf => "pdf",
            Self::Docx => "docx",
            Self::Xlsx => "xlsx",
            Self::Pptx => "pptx",
            Self::PlainText => "text",
            Self::Markdown => "markdown",
            Self::Html => "html",
            Self::Unknown => "unknown",
        }
    }
}
46
47#[derive(Debug, Clone)]
49pub struct ReaderHint<'a> {
50 pub mime: Option<&'a str>,
51 pub format: Option<DocumentFormat>,
52 pub uri: Option<&'a str>,
53 pub magic_bytes: Option<&'a [u8]>,
54}
55
56impl<'a> ReaderHint<'a> {
57 #[must_use]
58 pub fn new(mime: Option<&'a str>, format: Option<DocumentFormat>) -> Self {
59 Self {
60 mime,
61 format,
62 uri: None,
63 magic_bytes: None,
64 }
65 }
66
67 #[must_use]
68 pub fn with_uri(mut self, uri: Option<&'a str>) -> Self {
69 self.uri = uri;
70 self
71 }
72
73 #[must_use]
74 pub fn with_magic(mut self, magic: Option<&'a [u8]>) -> Self {
75 self.magic_bytes = magic;
76 self
77 }
78}
79
80#[derive(Debug, Clone)]
82pub struct ReaderOutput {
83 pub document: ExtractedDocument,
84 pub reader_name: String,
85 pub diagnostics: ReaderDiagnostics,
86}
87
88impl ReaderOutput {
89 #[must_use]
90 pub fn new(document: ExtractedDocument, reader_name: impl Into<String>) -> Self {
91 Self {
92 document,
93 reader_name: reader_name.into(),
94 diagnostics: ReaderDiagnostics::default(),
95 }
96 }
97
98 #[must_use]
99 pub fn with_diagnostics(mut self, diagnostics: ReaderDiagnostics) -> Self {
100 self.diagnostics = diagnostics;
101 self
102 }
103}
104
105#[derive(Debug, Clone, Default)]
107pub struct ReaderDiagnostics {
108 pub warnings: Vec<String>,
109 pub fallback: bool,
110 pub extra_metadata: Value,
111 pub duration_ms: Option<u64>,
112 pub pages_processed: Option<u32>,
113}
114
115impl ReaderDiagnostics {
116 pub fn record_warning<S: Into<String>>(&mut self, warning: S) {
117 self.warnings.push(warning.into());
118 }
119
120 pub fn mark_fallback(&mut self) {
121 self.fallback = true;
122 }
123
124 pub fn with_metadata(mut self, value: Value) -> Self {
125 self.extra_metadata = value;
126 self
127 }
128
129 pub fn merge_from(&mut self, other: &ReaderDiagnostics) {
130 self.warnings.extend(other.warnings.iter().cloned());
131 if other.fallback {
132 self.fallback = true;
133 }
134 if !other.extra_metadata.is_null() {
135 self.extra_metadata = other.extra_metadata.clone();
136 }
137 if other.duration_ms.is_some() {
138 self.duration_ms = other.duration_ms;
139 }
140 if other.pages_processed.is_some() {
141 self.pages_processed = other.pages_processed;
142 }
143 }
144
145 pub fn track_warning<S: Into<String>>(&mut self, warning: S) {
146 self.warnings.push(warning.into());
147 self.fallback = true;
148 }
149}
150
151pub trait DocumentReader: Send + Sync {
153 fn name(&self) -> &'static str;
155
156 fn supports(&self, hint: &ReaderHint<'_>) -> bool;
158
159 fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput>;
161}
162
163pub struct ReaderRegistry {
165 readers: Vec<Box<dyn DocumentReader>>,
166}
167
168impl ReaderRegistry {
169 #[must_use]
170 pub fn new() -> Self {
171 Self {
172 readers: Vec::new(),
173 }
174 }
175
176 pub fn register<R>(&mut self, reader: R)
177 where
178 R: DocumentReader + 'static,
179 {
180 self.readers.push(Box::new(reader));
181 }
182
183 #[must_use]
184 pub fn readers(&self) -> &[Box<dyn DocumentReader>] {
185 &self.readers
186 }
187
188 pub fn find_reader<'a>(&'a self, hint: &ReaderHint<'_>) -> Option<&'a dyn DocumentReader> {
189 self.readers
190 .iter()
191 .map(std::convert::AsRef::as_ref)
192 .find(|reader| reader.supports(hint))
193 }
194}
195
196impl Default for ReaderRegistry {
197 fn default() -> Self {
198 let mut registry = Self::new();
199 registry.register(PdfReader);
200 registry.register(DocxReader);
201 registry.register(XlsxReader);
202 registry.register(PptxReader);
203 registry.register(PassthroughReader);
204 registry
205 }
206}