memvid_core/reader/
mod.rs1mod docx;
4mod passthrough;
5mod pdf;
6mod pptx;
7mod xls;
8mod xlsx;
9pub(crate) mod xlsx_chunker;
10pub(crate) mod xlsx_ooxml;
11pub(crate) mod xlsx_table_detect;
12
13use serde_json::Value;
14
15pub use docx::DocxReader;
16pub use passthrough::PassthroughReader;
17pub use pdf::PdfReader;
18pub use pptx::PptxReader;
19pub use xls::XlsReader;
20pub use xlsx::{XlsxReader, XlsxStructuredDiagnostics, XlsxStructuredResult};
21pub use xlsx_chunker::XlsxChunkingOptions;
22pub use xlsx_table_detect::DetectedTable;
23
24use crate::{ExtractedDocument, Result};
25
/// Document formats that a [`ReaderHint`] can name.
///
/// The enum is `Copy`, comparable and hashable, so it can be passed by value
/// and used as a map or set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DocumentFormat {
    /// Labelled `"pdf"`.
    Pdf,
    /// Labelled `"docx"`.
    Docx,
    /// Labelled `"xlsx"`.
    Xlsx,
    /// Labelled `"xls"`.
    Xls,
    /// Labelled `"pptx"`.
    Pptx,
    /// Labelled `"text"`.
    PlainText,
    /// Labelled `"markdown"`.
    Markdown,
    /// Labelled `"html"`.
    Html,
    /// Labelled `"jsonl"`.
    Jsonl,
    /// Labelled `"unknown"`.
    Unknown,
}

impl DocumentFormat {
    /// Returns the fixed lowercase label for this format.
    ///
    /// The match is exhaustive on purpose: adding a variant without giving it
    /// a label is a compile error rather than a silent fallback.
    #[must_use]
    pub const fn label(self) -> &'static str {
        match self {
            Self::Pdf => "pdf",
            Self::Docx => "docx",
            Self::Xlsx => "xlsx",
            Self::Xls => "xls",
            Self::Pptx => "pptx",
            Self::PlainText => "text",
            Self::Markdown => "markdown",
            Self::Html => "html",
            Self::Jsonl => "jsonl",
            Self::Unknown => "unknown",
        }
    }
}

59#[derive(Debug, Clone)]
61pub struct ReaderHint<'a> {
62 pub mime: Option<&'a str>,
63 pub format: Option<DocumentFormat>,
64 pub uri: Option<&'a str>,
65 pub magic_bytes: Option<&'a [u8]>,
66}
67
68impl<'a> ReaderHint<'a> {
69 #[must_use]
70 pub fn new(mime: Option<&'a str>, format: Option<DocumentFormat>) -> Self {
71 Self {
72 mime,
73 format,
74 uri: None,
75 magic_bytes: None,
76 }
77 }
78
79 #[must_use]
80 pub fn with_uri(mut self, uri: Option<&'a str>) -> Self {
81 self.uri = uri;
82 self
83 }
84
85 #[must_use]
86 pub fn with_magic(mut self, magic: Option<&'a [u8]>) -> Self {
87 self.magic_bytes = magic;
88 self
89 }
90}
91
92#[derive(Debug, Clone)]
94pub struct ReaderOutput {
95 pub document: ExtractedDocument,
96 pub reader_name: String,
97 pub diagnostics: ReaderDiagnostics,
98}
99
100impl ReaderOutput {
101 #[must_use]
102 pub fn new(document: ExtractedDocument, reader_name: impl Into<String>) -> Self {
103 Self {
104 document,
105 reader_name: reader_name.into(),
106 diagnostics: ReaderDiagnostics::default(),
107 }
108 }
109
110 #[must_use]
111 pub fn with_diagnostics(mut self, diagnostics: ReaderDiagnostics) -> Self {
112 self.diagnostics = diagnostics;
113 self
114 }
115}
116
117#[derive(Debug, Clone, Default)]
119pub struct ReaderDiagnostics {
120 pub warnings: Vec<String>,
121 pub fallback: bool,
122 pub extra_metadata: Value,
123 pub duration_ms: Option<u64>,
124 pub pages_processed: Option<u32>,
125}
126
127impl ReaderDiagnostics {
128 pub fn record_warning<S: Into<String>>(&mut self, warning: S) {
129 self.warnings.push(warning.into());
130 }
131
132 pub fn mark_fallback(&mut self) {
133 self.fallback = true;
134 }
135
136 #[must_use]
137 pub fn with_metadata(mut self, value: Value) -> Self {
138 self.extra_metadata = value;
139 self
140 }
141
142 pub fn merge_from(&mut self, other: &ReaderDiagnostics) {
143 self.warnings.extend(other.warnings.iter().cloned());
144 if other.fallback {
145 self.fallback = true;
146 }
147 if !other.extra_metadata.is_null() {
148 self.extra_metadata = other.extra_metadata.clone();
149 }
150 if other.duration_ms.is_some() {
151 self.duration_ms = other.duration_ms;
152 }
153 if other.pages_processed.is_some() {
154 self.pages_processed = other.pages_processed;
155 }
156 }
157
158 pub fn track_warning<S: Into<String>>(&mut self, warning: S) {
159 self.warnings.push(warning.into());
160 self.fallback = true;
161 }
162}
163
164pub trait DocumentReader: Send + Sync {
166 fn name(&self) -> &'static str;
168
169 fn supports(&self, hint: &ReaderHint<'_>) -> bool;
171
172 fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput>;
174}
175
176pub struct ReaderRegistry {
178 readers: Vec<Box<dyn DocumentReader>>,
179}
180
181impl ReaderRegistry {
182 #[must_use]
183 pub fn new() -> Self {
184 Self {
185 readers: Vec::new(),
186 }
187 }
188
189 pub fn register<R>(&mut self, reader: R)
190 where
191 R: DocumentReader + 'static,
192 {
193 self.readers.push(Box::new(reader));
194 }
195
196 #[must_use]
197 pub fn readers(&self) -> &[Box<dyn DocumentReader>] {
198 &self.readers
199 }
200
201 pub fn find_reader<'a>(&'a self, hint: &ReaderHint<'_>) -> Option<&'a dyn DocumentReader> {
202 self.readers
203 .iter()
204 .map(std::convert::AsRef::as_ref)
205 .find(|reader| reader.supports(hint))
206 }
207}
208
209impl Default for ReaderRegistry {
210 fn default() -> Self {
211 let mut registry = Self::new();
212 registry.register(PdfReader);
213 registry.register(DocxReader);
214 registry.register(XlsxReader);
215 registry.register(XlsReader);
216 registry.register(PptxReader);
217 registry.register(PassthroughReader);
218 registry
219 }
220}