1use std::path::Path;
2
3use anyhow::{Context, Result};
4use calamine::Reader;
5
6#[derive(Debug, Clone)]
7pub struct ExtractedDoc {
8 pub file: String,
9 pub title: Option<String>,
10 pub content_hash: String,
11 pub format: DocFormat,
12 pub text: String,
13 pub page_count: Option<usize>,
14}
15
/// Document formats the extractor knows how to read.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DocFormat {
    /// Markdown source.
    Markdown,
    /// Unformatted plain text.
    PlainText,
    /// reStructuredText.
    Rst,
    /// AsciiDoc.
    Asciidoc,
    /// Emacs Org mode.
    Org,
    /// Portable Document Format.
    Pdf,
    /// Word document (OOXML).
    Docx,
    /// PowerPoint presentation (OOXML).
    Pptx,
    /// Spreadsheet workbook.
    Xlsx,
    /// HTML page.
    Html,
    /// Rich Text Format.
    Rtf,
    /// Generic XML.
    Xml,
}

impl DocFormat {
    /// Returns the stable lowercase label for this format.
    ///
    /// Note that [`DocFormat::PlainText`] is labelled `"text"`, not
    /// `"plaintext"`.
    pub fn as_str(&self) -> &'static str {
        match *self {
            DocFormat::Markdown => "markdown",
            DocFormat::PlainText => "text",
            DocFormat::Rst => "rst",
            DocFormat::Asciidoc => "asciidoc",
            DocFormat::Org => "org",
            DocFormat::Pdf => "pdf",
            DocFormat::Docx => "docx",
            DocFormat::Pptx => "pptx",
            DocFormat::Xlsx => "xlsx",
            DocFormat::Html => "html",
            DocFormat::Rtf => "rtf",
            DocFormat::Xml => "xml",
        }
    }
}
50
51pub fn extract_document(path: &Path, bytes: &[u8], ext: &str) -> Result<ExtractedDoc> {
52 let format = match ext {
53 "md" | "markdown" => DocFormat::Markdown,
54 "txt" => DocFormat::PlainText,
55 "rst" => DocFormat::Rst,
56 "adoc" => DocFormat::Asciidoc,
57 "org" => DocFormat::Org,
58 "pdf" => DocFormat::Pdf,
59 "docx" => DocFormat::Docx,
60 "pptx" => DocFormat::Pptx,
61 "xlsx" => DocFormat::Xlsx,
62 "html" | "htm" => DocFormat::Html,
63 "rtf" => DocFormat::Rtf,
64 "xml" | "xsl" | "xsd" | "svg" | "plist" => DocFormat::Xml,
65 _ => anyhow::bail!("unsupported document format: {ext}"),
66 };
67
68 let (text, title, page_count) = match format {
69 DocFormat::Markdown
70 | DocFormat::PlainText
71 | DocFormat::Rst
72 | DocFormat::Asciidoc
73 | DocFormat::Org => extract_text(bytes)?,
74 DocFormat::Pdf => extract_pdf(path, bytes)?,
75 DocFormat::Docx => extract_docx(bytes)?,
76 DocFormat::Pptx => extract_pptx(bytes)?,
77 DocFormat::Xlsx => extract_xlsx(bytes)?,
78 DocFormat::Html => extract_html(bytes)?,
79 DocFormat::Rtf => extract_rtf(bytes)?,
80 DocFormat::Xml => extract_xml(bytes)?,
81 };
82
83 Ok(ExtractedDoc {
84 file: String::new(),
85 title,
86 content_hash: String::new(),
87 format,
88 text,
89 page_count,
90 })
91}
92
/// Decodes a plain-text style document (Markdown, reST, AsciiDoc, Org, text).
///
/// The bytes are decoded as UTF-8, with invalid sequences replaced, and
/// returned unchanged as the text. The title is the first non-blank line
/// with surrounding whitespace and leading `#` heading markers removed.
/// A leading UTF-8 byte-order mark is ignored when looking for the title,
/// so it neither becomes part of the title nor blocks the `#` stripping.
///
/// The page count is always `None`.
fn extract_text(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
    let text = String::from_utf8_lossy(bytes).into_owned();
    let title = text
        .trim_start_matches('\u{feff}')
        .lines()
        .map(|line| line.trim().trim_start_matches('#').trim())
        .find(|candidate| !candidate.is_empty())
        .map(str::to_string);
    Ok((text, title, None))
}
102
103fn extract_pdf(path: &Path, bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
104 match pdf_oxide::PdfDocument::from_bytes(bytes.to_vec()) {
105 Ok(doc) => {
106 let page_count = doc.page_count().unwrap_or(0);
107 let mut pages_text = Vec::new();
108 for i in 0..page_count {
109 match doc.extract_text(i) {
110 Ok(text) => pages_text.push(text),
111 Err(e) => {
112 eprintln!(
113 "warning: PDF page {} extraction failed in {}: {e}",
114 i + 1,
115 path.display()
116 );
117 }
118 }
119 }
120 let text = pages_text.join("\n");
121 let title = text
122 .lines()
123 .next()
124 .map(|l| l.trim().to_string())
125 .filter(|t| !t.is_empty());
126 let count = if page_count > 0 {
127 Some(page_count)
128 } else {
129 None
130 };
131 Ok((text, title, count))
132 }
133 Err(_) => {
134 let text = String::from_utf8_lossy(bytes);
135 if text.is_ascii() && text.len() > 10 {
136 let title = text
137 .lines()
138 .next()
139 .map(|l| l.trim().to_string())
140 .filter(|t| !t.is_empty());
141 Ok((text.into_owned(), title, None))
142 } else {
143 anyhow::bail!("PDF extraction failed: {}", path.display())
144 }
145 }
146 }
147}
148
149fn extract_docx(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
150 let cursor = std::io::Cursor::new(bytes);
151 let mut archive = zip::ZipArchive::new(cursor).context("DOCX is not a valid ZIP archive")?;
152
153 let mut text = String::new();
154 let mut title = None;
155
156 if let Ok(mut file) = archive.by_name("word/document.xml") {
157 let mut xml = String::new();
158 std::io::Read::read_to_string(&mut file, &mut xml)?;
159 text = extract_text_from_ooxml(&xml);
160 title = text
161 .lines()
162 .next()
163 .map(|l| l.trim().to_string())
164 .filter(|t| !t.is_empty());
165 }
166
167 Ok((text, title, None))
168}
169
170fn extract_pptx(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
171 let cursor = std::io::Cursor::new(bytes);
172 let mut archive = zip::ZipArchive::new(cursor).context("PPTX is not a valid ZIP archive")?;
173
174 let mut all_text = Vec::new();
175 let mut slide_names: Vec<String> = Vec::new();
176
177 for i in 0..archive.len() {
178 let file = archive.by_index(i)?;
179 let name = file.name().to_string();
180 if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") {
181 slide_names.push(name);
182 }
183 }
184 slide_names.sort();
185
186 let page_count = Some(slide_names.len());
187
188 for name in &slide_names {
189 if let Ok(mut file) = archive.by_name(name) {
190 let mut xml = String::new();
191 std::io::Read::read_to_string(&mut file, &mut xml)?;
192 let slide_text = extract_text_from_ooxml(&xml);
193 if !slide_text.is_empty() {
194 all_text.push(slide_text);
195 }
196 }
197 }
198
199 let text = all_text.join("\n\n");
200 let title = text
201 .lines()
202 .next()
203 .map(|l| l.trim().to_string())
204 .filter(|t| !t.is_empty());
205 Ok((text, title, page_count))
206}
207
208fn extract_xlsx(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
209 let cursor = std::io::Cursor::new(bytes);
210 let mut workbook =
211 calamine::open_workbook_auto_from_rs(cursor).context("Failed to open spreadsheet")?;
212
213 let mut all_text = Vec::new();
214 let sheet_names: Vec<String> = workbook.sheet_names().to_vec();
215 let page_count = Some(sheet_names.len());
216
217 for name in &sheet_names {
218 if let Ok(range) = workbook.worksheet_range(name) {
219 let mut sheet_text = format!("Sheet: {}\n", name);
220 for row in range.rows() {
221 let cells: Vec<String> = row.iter().map(|cell| format!("{}", cell)).collect();
222 let line = cells.join("\t");
223 if !line.trim().is_empty() {
224 sheet_text.push_str(&line);
225 sheet_text.push('\n');
226 }
227 }
228 all_text.push(sheet_text);
229 }
230 }
231
232 let text = all_text.join("\n");
233 let title = sheet_names.first().cloned();
234 Ok((text, title, page_count))
235}
236
/// Extracts the visible text and the `<title>` of an HTML document.
///
/// This is a lightweight tag stripper, not a full HTML parser:
///
/// * every tag is replaced by a word separator, and a `>` outside any tag is
///   treated as a separator too;
/// * comments and the contents of `<script>` and `<style>` elements are
///   dropped, so code and CSS do not end up in the text;
/// * runs of whitespace are collapsed to a single space and the result is
///   trimmed;
/// * character references such as `&amp;` are not decoded.
///
/// The title is the trimmed content of the first `<title>` element, matched
/// case-insensitively and with attributes allowed. It is `None` when there
/// is no such element or it is empty. The title text also remains part of
/// the returned text.
///
/// The page count is always `None`.
fn extract_html(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
    let decoded = String::from_utf8_lossy(bytes);
    let html: &str = &decoded;
    // ASCII-only lowercasing keeps every byte offset identical to `html`, so
    // positions found in `lower` can be used to slice the original text.
    let lower = html.to_ascii_lowercase();
    let title = html_title(html, &lower);

    let mut raw = String::with_capacity(html.len());
    let mut pos = 0;
    while let Some(offset) = html[pos..].find('<') {
        let lt = pos + offset;
        html_push_text(&mut raw, &html[pos..lt]);
        // Every tag (or skipped region) separates the words around it.
        raw.push(' ');

        let tail = &lower[lt..];
        let resume = if tail.starts_with("<!--") {
            tail.find("-->").map(|i| lt + i + "-->".len())
        } else {
            let tag_end = tail.find('>').map(|i| lt + i + 1);
            match (tag_end, html_raw_text_closer(tail)) {
                // Jump to the closing tag; it is then handled as a normal tag.
                // An unterminated script/style swallows the rest of the input.
                (Some(end), Some(closer)) => {
                    Some(lower[end..].find(closer).map_or(html.len(), |i| end + i))
                }
                (end, _) => end,
            }
        };
        // An unterminated tag or comment swallows the rest of the input.
        pos = resume.unwrap_or(html.len());
    }
    html_push_text(&mut raw, &html[pos..]);

    let text = raw.split_whitespace().collect::<Vec<_>>().join(" ");
    Ok((text, title, None))
}

/// Appends text found between tags, turning a stray `>` into a separator.
fn html_push_text(out: &mut String, segment: &str) {
    out.extend(segment.chars().map(|c| if c == '>' { ' ' } else { c }));
}

/// Returns the closing-tag prefix to skip to when `tag` (lowercased, starting
/// at `<`) opens a `<script>` or `<style>` element.
fn html_raw_text_closer(tag: &str) -> Option<&'static str> {
    const RAW_TEXT: [(&str, &str); 2] = [("<script", "</script"), ("<style", "</style")];
    RAW_TEXT.iter().find_map(|&(open, close)| {
        let after = tag.strip_prefix(open)?;
        match after.chars().next() {
            // A longer element name that merely starts with "script"/"style".
            Some(c) if c != '>' && c != '/' && !c.is_ascii_whitespace() => None,
            _ => Some(close),
        }
    })
}

/// Finds the trimmed content of the first `<title>` element, if non-empty.
/// `lower` must be the ASCII-lowercased copy of `html`.
fn html_title(html: &str, lower: &str) -> Option<String> {
    const OPEN: &str = "<title";
    let mut from = 0;
    loop {
        let open = from + lower[from..].find(OPEN)?;
        let after = open + OPEN.len();
        match lower[after..].chars().next() {
            Some(c) if c == '>' || c.is_ascii_whitespace() => {
                let body = after + lower[after..].find('>')? + 1;
                let end = body + lower[body..].find("</title")?;
                let title = html[body..end].trim();
                return if title.is_empty() {
                    None
                } else {
                    Some(title.to_string())
                };
            }
            // Some other element whose name merely starts with "title".
            Some(_) => from = after,
            None => return None,
        }
    }
}
273
/// Extracts readable text from an RTF document with a small tokenizer.
///
/// Handling of RTF syntax:
///
/// * a control word is `\` + letters + an optional signed number; a single
///   space or line break right after it is its delimiter and is emitted as
///   one space, while any other character ends the word and is processed
///   normally, so text following a control word is never swallowed;
/// * `\\`, `\{` and `\}` are literal characters, not group braces;
/// * `\'hh` hex escapes are skipped, because decoding them requires the
///   document's code page;
/// * `\uN` is decoded to the Unicode character and its fallback characters
///   (count set by `\ucN`, default 1) are skipped;
/// * the `fonttbl`, `colortbl`, `stylesheet`, `info` and `pict` destinations
///   and `\*` ignorable destinations are skipped entirely;
/// * only text at group depth 2 or shallower is kept.
///
/// The text is trimmed and the title is its trimmed first line. The page
/// count is always `None`.
fn extract_rtf(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
    // Destinations whose group content is metadata or binary data.
    const SKIPPED_DESTINATIONS: [&str; 5] = ["fonttbl", "colortbl", "stylesheet", "info", "pict"];

    let rtf = String::from_utf8_lossy(bytes);
    let mut text = String::new();
    let mut brace_depth = 0i32;
    // Depth of the destination group currently being skipped, if any.
    let mut skipping_from: Option<i32> = None;
    // Fallback characters that follow each `\uN` escape (`\ucN`, default 1).
    let mut unicode_fallback = 1usize;
    let mut chars = rtf.chars().peekable();

    while let Some(ch) = chars.next() {
        let visible = brace_depth <= 2 && skipping_from.is_none();
        match ch {
            '{' => brace_depth += 1,
            '}' => {
                brace_depth -= 1;
                // Leaving the group that started the skipped destination.
                if skipping_from.map_or(false, |start| brace_depth < start) {
                    skipping_from = None;
                }
            }
            '\\' => match chars.peek().copied() {
                Some(c) if c.is_ascii_alphabetic() => {
                    let (word, param, delimited) = rtf_control_word(&mut chars);
                    match word.as_str() {
                        "u" => {
                            if let Some(n) = param {
                                // Code points above 32767 are written as
                                // negative signed 16-bit values.
                                let code = if n < 0 { n + 65_536 } else { n };
                                if code >= 0 && visible {
                                    // `code` is non-negative, so the cast is lossless.
                                    if let Some(decoded) = std::char::from_u32(code as u32) {
                                        text.push(decoded);
                                    }
                                }
                                // Skip plain fallback characters; `\'hh`
                                // fallbacks are dropped by the hex branch.
                                for _ in 0..unicode_fallback {
                                    let skipped =
                                        chars.next_if(|&c| !matches!(c, '\\' | '{' | '}'));
                                    if skipped.is_none() {
                                        break;
                                    }
                                }
                            }
                        }
                        "uc" => {
                            if let Some(n) = param {
                                if n >= 0 {
                                    // Non-negative, so the cast is lossless.
                                    unicode_fallback = n as usize;
                                }
                            }
                        }
                        name if SKIPPED_DESTINATIONS.contains(&name) => {
                            if skipping_from.is_none() {
                                skipping_from = Some(brace_depth);
                            }
                        }
                        _ if delimited && visible => text.push(' '),
                        _ => {}
                    }
                }
                Some(c) if matches!(c, '\\' | '{' | '}') => {
                    chars.next();
                    if visible {
                        text.push(c);
                    }
                }
                Some('\'') => {
                    chars.next();
                    for _ in 0..2 {
                        if chars.next_if(|c| c.is_ascii_hexdigit()).is_none() {
                            break;
                        }
                    }
                }
                Some('*') => {
                    chars.next();
                    if skipping_from.is_none() {
                        skipping_from = Some(brace_depth);
                    }
                }
                Some(c) => {
                    chars.next();
                    // `\~` is a non-breaking space; `\` before a line break
                    // ends a paragraph. Both separate words.
                    if visible && matches!(c, '~' | '\n' | '\r') {
                        text.push(' ');
                    }
                }
                None => {}
            },
            _ if visible => text.push(ch),
            _ => {}
        }
    }

    let text = text.trim().to_string();
    let title = text
        .lines()
        .next()
        .map(|l| l.trim().to_string())
        .filter(|t| !t.is_empty());
    Ok((text, title, None))
}

/// Reads one control word (the leading `\` already consumed): its letters,
/// optional signed numeric parameter, and whether a delimiter (a space or a
/// line break) followed and was consumed.
fn rtf_control_word(
    chars: &mut std::iter::Peekable<std::str::Chars>,
) -> (String, Option<i32>, bool) {
    let mut word = String::new();
    while let Some(c) = chars.next_if(|c| c.is_ascii_alphabetic()) {
        word.push(c);
    }

    let mut digits = String::new();
    // A '-' belongs to the parameter only when a digit follows it.
    let mut ahead = chars.clone();
    if ahead.next() == Some('-') && ahead.next().map_or(false, |c| c.is_ascii_digit()) {
        chars.next();
        digits.push('-');
    }
    while let Some(c) = chars.next_if(|c| c.is_ascii_digit()) {
        digits.push(c);
    }
    let param = digits.parse().ok();

    let delimited = if chars.next_if_eq(&'\r').is_some() {
        chars.next_if_eq(&'\n');
        true
    } else {
        chars.next_if(|&c| c == ' ' || c == '\n').is_some()
    };

    (word, param, delimited)
}
305
306fn extract_xml(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
307 let xml_str = String::from_utf8_lossy(bytes);
308 let mut text = String::new();
309 let mut title = None;
310 let mut reader = quick_xml::Reader::from_str(&xml_str);
311 let mut buf = Vec::new();
312 let mut depth = 0u32;
313
314 loop {
315 match reader.read_event_into(&mut buf) {
316 Ok(quick_xml::events::Event::Start(ref e)) => {
317 depth += 1;
318 if depth > 1 && !text.is_empty() && !text.ends_with('\n') {
319 text.push('\n');
320 }
321 if depth == 1 && title.is_none() {
323 let local = e.local_name();
324 let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
325 if !name.is_empty() {
326 title = Some(name.to_string());
327 }
328 }
329 }
330 Ok(quick_xml::events::Event::Text(ref e)) => {
331 if let Ok(t) = e.unescape() {
332 let trimmed = t.trim();
333 if !trimmed.is_empty() {
334 if !text.is_empty() && !text.ends_with('\n') && !text.ends_with(' ') {
335 text.push(' ');
336 }
337 text.push_str(trimmed);
338 }
339 }
340 }
341 Ok(quick_xml::events::Event::End(_)) => {
342 depth = depth.saturating_sub(1);
343 }
344 Ok(quick_xml::events::Event::Eof) => break,
345 Err(_) => break,
346 _ => {}
347 }
348 buf.clear();
349 }
350
351 let text = text.trim().to_string();
352 Ok((text, title, None))
353}
354
355fn extract_text_from_ooxml(xml: &str) -> String {
356 let mut text = String::new();
357 let mut reader = quick_xml::Reader::from_str(xml);
358 let mut in_text = false;
359 let mut buf = Vec::new();
360
361 loop {
362 match reader.read_event_into(&mut buf) {
363 Ok(quick_xml::events::Event::Start(ref e))
364 | Ok(quick_xml::events::Event::Empty(ref e)) => {
365 let local = e.local_name();
366 let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
367 if name == "t" {
368 in_text = true;
369 }
370 if name == "p" && !text.is_empty() && !text.ends_with('\n') {
372 text.push('\n');
373 }
374 }
375 Ok(quick_xml::events::Event::Text(ref e)) if in_text => {
376 if let Ok(t) = e.unescape() {
377 text.push_str(&t);
378 }
379 }
380 Ok(quick_xml::events::Event::End(ref e)) => {
381 let local = e.local_name();
382 let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
383 if name == "t" {
384 in_text = false;
385 }
386 }
387 Ok(quick_xml::events::Event::Eof) => break,
388 Err(_) => break,
389 _ => {}
390 }
391 buf.clear();
392 }
393
394 text
395}