1use std::path::Path;
2
3use anyhow::{Context, Result};
4use calamine::Reader;
5
6#[derive(Debug, Clone)]
7pub struct ExtractedDoc {
8 pub file: String,
9 pub title: Option<String>,
10 pub content_hash: String,
11 pub format: DocFormat,
12 pub text: String,
13 pub page_count: Option<usize>,
14}
15
/// Document formats the extractor knows how to read.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocFormat {
    Markdown,
    PlainText,
    Rst,
    Asciidoc,
    Org,
    Pdf,
    Docx,
    Pptx,
    Xlsx,
    Html,
    Rtf,
    Xml,
}

impl DocFormat {
    /// Returns the stable lowercase identifier for this format.
    ///
    /// Note that [`DocFormat::PlainText`] maps to `"text"`, not `"plaintext"`.
    pub fn as_str(&self) -> &'static str {
        match *self {
            DocFormat::Markdown => "markdown",
            DocFormat::PlainText => "text",
            DocFormat::Rst => "rst",
            DocFormat::Asciidoc => "asciidoc",
            DocFormat::Org => "org",
            DocFormat::Pdf => "pdf",
            DocFormat::Docx => "docx",
            DocFormat::Pptx => "pptx",
            DocFormat::Xlsx => "xlsx",
            DocFormat::Html => "html",
            DocFormat::Rtf => "rtf",
            DocFormat::Xml => "xml",
        }
    }
}
50
51pub fn extract_document(path: &Path, bytes: &[u8], ext: &str) -> Result<ExtractedDoc> {
52 let format = match ext {
53 "md" | "markdown" => DocFormat::Markdown,
54 "txt" => DocFormat::PlainText,
55 "rst" => DocFormat::Rst,
56 "adoc" => DocFormat::Asciidoc,
57 "org" => DocFormat::Org,
58 "pdf" => DocFormat::Pdf,
59 "docx" => DocFormat::Docx,
60 "pptx" => DocFormat::Pptx,
61 "xlsx" => DocFormat::Xlsx,
62 "html" | "htm" => DocFormat::Html,
63 "rtf" => DocFormat::Rtf,
64 "xml" | "xsl" | "xsd" | "svg" | "plist" => DocFormat::Xml,
65 _ => anyhow::bail!("unsupported document format: {ext}"),
66 };
67
68 let (text, title, page_count) = match format {
69 DocFormat::Markdown
70 | DocFormat::PlainText
71 | DocFormat::Rst
72 | DocFormat::Asciidoc
73 | DocFormat::Org => extract_text(bytes)?,
74 DocFormat::Pdf => extract_pdf(path, bytes)?,
75 DocFormat::Docx => extract_docx(bytes)?,
76 DocFormat::Pptx => extract_pptx(bytes)?,
77 DocFormat::Xlsx => extract_xlsx(bytes)?,
78 DocFormat::Html => extract_html(bytes)?,
79 DocFormat::Rtf => extract_rtf(bytes)?,
80 DocFormat::Xml => extract_xml(bytes)?,
81 };
82
83 Ok(ExtractedDoc {
84 file: String::new(),
85 title,
86 content_hash: String::new(),
87 format,
88 text,
89 page_count,
90 })
91}
92
93fn extract_text(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
94 let text = String::from_utf8_lossy(bytes).into_owned();
95 let title = text
96 .lines()
97 .next()
98 .map(|l| l.trim_start_matches('#').trim().to_string())
99 .filter(|t| !t.is_empty());
100 Ok((text, title, None))
101}
102
103fn extract_pdf(path: &Path, bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
104 match pdf_oxide::PdfDocument::from_bytes(bytes.to_vec()) {
105 Ok(doc) => {
106 let page_count = doc.page_count().unwrap_or(0);
107 let mut pages_text = Vec::new();
108 for i in 0..page_count {
109 match doc.extract_text(i) {
110 Ok(text) => pages_text.push(text),
111 Err(e) => {
112 eprintln!("warning: PDF page {} extraction failed in {}: {e}", i + 1, path.display());
113 }
114 }
115 }
116 let text = pages_text.join("\n");
117 let title = text.lines().next().map(|l| l.trim().to_string()).filter(|t| !t.is_empty());
118 let count = if page_count > 0 { Some(page_count) } else { None };
119 Ok((text, title, count))
120 }
121 Err(_) => {
122 let text = String::from_utf8_lossy(bytes);
123 if text.is_ascii() && text.len() > 10 {
124 let title = text.lines().next().map(|l| l.trim().to_string()).filter(|t| !t.is_empty());
125 Ok((text.into_owned(), title, None))
126 } else {
127 anyhow::bail!("PDF extraction failed: {}", path.display())
128 }
129 }
130 }
131}
132
133fn extract_docx(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
134 let cursor = std::io::Cursor::new(bytes);
135 let mut archive = zip::ZipArchive::new(cursor).context("DOCX is not a valid ZIP archive")?;
136
137 let mut text = String::new();
138 let mut title = None;
139
140 if let Ok(mut file) = archive.by_name("word/document.xml") {
141 let mut xml = String::new();
142 std::io::Read::read_to_string(&mut file, &mut xml)?;
143 text = extract_text_from_ooxml(&xml);
144 title = text
145 .lines()
146 .next()
147 .map(|l| l.trim().to_string())
148 .filter(|t| !t.is_empty());
149 }
150
151 Ok((text, title, None))
152}
153
154fn extract_pptx(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
155 let cursor = std::io::Cursor::new(bytes);
156 let mut archive = zip::ZipArchive::new(cursor).context("PPTX is not a valid ZIP archive")?;
157
158 let mut all_text = Vec::new();
159 let mut slide_names: Vec<String> = Vec::new();
160
161 for i in 0..archive.len() {
162 let file = archive.by_index(i)?;
163 let name = file.name().to_string();
164 if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") {
165 slide_names.push(name);
166 }
167 }
168 slide_names.sort();
169
170 let page_count = Some(slide_names.len());
171
172 for name in &slide_names {
173 if let Ok(mut file) = archive.by_name(name) {
174 let mut xml = String::new();
175 std::io::Read::read_to_string(&mut file, &mut xml)?;
176 let slide_text = extract_text_from_ooxml(&xml);
177 if !slide_text.is_empty() {
178 all_text.push(slide_text);
179 }
180 }
181 }
182
183 let text = all_text.join("\n\n");
184 let title = text
185 .lines()
186 .next()
187 .map(|l| l.trim().to_string())
188 .filter(|t| !t.is_empty());
189 Ok((text, title, page_count))
190}
191
192fn extract_xlsx(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
193 let cursor = std::io::Cursor::new(bytes);
194 let mut workbook =
195 calamine::open_workbook_auto_from_rs(cursor).context("Failed to open spreadsheet")?;
196
197 let mut all_text = Vec::new();
198 let sheet_names: Vec<String> = workbook.sheet_names().to_vec();
199 let page_count = Some(sheet_names.len());
200
201 for name in &sheet_names {
202 if let Ok(range) = workbook.worksheet_range(name) {
203 let mut sheet_text = format!("Sheet: {}\n", name);
204 for row in range.rows() {
205 let cells: Vec<String> = row.iter().map(|cell| format!("{}", cell)).collect();
206 let line = cells.join("\t");
207 if !line.trim().is_empty() {
208 sheet_text.push_str(&line);
209 sheet_text.push('\n');
210 }
211 }
212 all_text.push(sheet_text);
213 }
214 }
215
216 let text = all_text.join("\n");
217 let title = sheet_names.first().cloned();
218 Ok((text, title, page_count))
219}
220
221fn extract_html(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
222 let html = String::from_utf8_lossy(bytes);
223 let mut text = String::new();
224 let mut in_tag = false;
225 let mut title = None;
226
227 if let Some(start) = html.find("<title>") {
229 if let Some(end) = html[start..].find("</title>") {
230 title = Some(html[start + 7..start + end].trim().to_string());
231 }
232 }
233
234 for ch in html.chars() {
236 match ch {
237 '<' => in_tag = true,
238 '>' => {
239 in_tag = false;
240 if !text.ends_with('\n') && !text.ends_with(' ') {
241 text.push(' ');
242 }
243 }
244 _ if !in_tag => text.push(ch),
245 _ => {}
246 }
247 }
248
249 let text = regex::Regex::new(r"\s+")
251 .unwrap()
252 .replace_all(text.trim(), " ")
253 .into_owned();
254
255 Ok((text, title, None))
256}
257
258fn extract_rtf(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
259 let rtf = String::from_utf8_lossy(bytes);
260 let mut text = String::new();
261 let mut in_control = false;
262 let mut brace_depth = 0i32;
263
264 for ch in rtf.chars() {
265 match ch {
266 '{' => brace_depth += 1,
267 '}' => brace_depth -= 1,
268 '\\' => in_control = true,
269 ' ' | '\n' if in_control => {
270 in_control = false;
271 if brace_depth <= 2 {
272 text.push(' ');
273 }
274 }
275 _ if in_control => {}
276 _ if brace_depth <= 2 => text.push(ch),
277 _ => {}
278 }
279 }
280
281 let text = text.trim().to_string();
282 let title = text
283 .lines()
284 .next()
285 .map(|l| l.trim().to_string())
286 .filter(|t| !t.is_empty());
287 Ok((text, title, None))
288}
289
290fn extract_xml(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
291 let xml_str = String::from_utf8_lossy(bytes);
292 let mut text = String::new();
293 let mut title = None;
294 let mut reader = quick_xml::Reader::from_str(&xml_str);
295 let mut buf = Vec::new();
296 let mut depth = 0u32;
297
298 loop {
299 match reader.read_event_into(&mut buf) {
300 Ok(quick_xml::events::Event::Start(ref e)) => {
301 depth += 1;
302 if depth > 1 && !text.is_empty() && !text.ends_with('\n') {
303 text.push('\n');
304 }
305 if depth == 1 && title.is_none() {
307 let local = e.local_name();
308 let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
309 if !name.is_empty() {
310 title = Some(name.to_string());
311 }
312 }
313 }
314 Ok(quick_xml::events::Event::Text(ref e)) => {
315 if let Ok(t) = e.unescape() {
316 let trimmed = t.trim();
317 if !trimmed.is_empty() {
318 if !text.is_empty() && !text.ends_with('\n') && !text.ends_with(' ') {
319 text.push(' ');
320 }
321 text.push_str(trimmed);
322 }
323 }
324 }
325 Ok(quick_xml::events::Event::End(_)) => {
326 depth = depth.saturating_sub(1);
327 }
328 Ok(quick_xml::events::Event::Eof) => break,
329 Err(_) => break,
330 _ => {}
331 }
332 buf.clear();
333 }
334
335 let text = text.trim().to_string();
336 Ok((text, title, None))
337}
338
339fn extract_text_from_ooxml(xml: &str) -> String {
340 let mut text = String::new();
341 let mut reader = quick_xml::Reader::from_str(xml);
342 let mut in_text = false;
343 let mut buf = Vec::new();
344
345 loop {
346 match reader.read_event_into(&mut buf) {
347 Ok(quick_xml::events::Event::Start(ref e))
348 | Ok(quick_xml::events::Event::Empty(ref e)) => {
349 let local = e.local_name();
350 let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
351 if name == "t" {
352 in_text = true;
353 }
354 if name == "p" && !text.is_empty() && !text.ends_with('\n') {
356 text.push('\n');
357 }
358 }
359 Ok(quick_xml::events::Event::Text(ref e)) if in_text => {
360 if let Ok(t) = e.unescape() {
361 text.push_str(&t);
362 }
363 }
364 Ok(quick_xml::events::Event::End(ref e)) => {
365 let local = e.local_name();
366 let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
367 if name == "t" {
368 in_text = false;
369 }
370 }
371 Ok(quick_xml::events::Event::Eof) => break,
372 Err(_) => break,
373 _ => {}
374 }
375 buf.clear();
376 }
377
378 text
379}