Skip to main content

infigraph_docs/
extract.rs

1use std::path::Path;
2
3use anyhow::{Context, Result};
4use calamine::Reader;
5
6#[derive(Debug, Clone)]
7pub struct ExtractedDoc {
8    pub file: String,
9    pub title: Option<String>,
10    pub content_hash: String,
11    pub format: DocFormat,
12    pub text: String,
13    pub page_count: Option<usize>,
14}
15
/// Document formats that have a dedicated extractor.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocFormat {
    Markdown,
    PlainText,
    Rst,
    Asciidoc,
    Org,
    Pdf,
    Docx,
    Pptx,
    Xlsx,
    Html,
    Rtf,
    Xml,
}

impl DocFormat {
    /// Returns the lowercase identifier of this format (for example
    /// `"markdown"` for [`DocFormat::Markdown`] and `"text"` for
    /// [`DocFormat::PlainText`]).
    pub fn as_str(&self) -> &'static str {
        match *self {
            DocFormat::Markdown => "markdown",
            DocFormat::PlainText => "text",
            DocFormat::Rst => "rst",
            DocFormat::Asciidoc => "asciidoc",
            DocFormat::Org => "org",
            DocFormat::Pdf => "pdf",
            DocFormat::Docx => "docx",
            DocFormat::Pptx => "pptx",
            DocFormat::Xlsx => "xlsx",
            DocFormat::Html => "html",
            DocFormat::Rtf => "rtf",
            DocFormat::Xml => "xml",
        }
    }
}
50
51pub fn extract_document(path: &Path, bytes: &[u8], ext: &str) -> Result<ExtractedDoc> {
52    let format = match ext {
53        "md" | "markdown" => DocFormat::Markdown,
54        "txt" => DocFormat::PlainText,
55        "rst" => DocFormat::Rst,
56        "adoc" => DocFormat::Asciidoc,
57        "org" => DocFormat::Org,
58        "pdf" => DocFormat::Pdf,
59        "docx" => DocFormat::Docx,
60        "pptx" => DocFormat::Pptx,
61        "xlsx" => DocFormat::Xlsx,
62        "html" | "htm" => DocFormat::Html,
63        "rtf" => DocFormat::Rtf,
64        "xml" | "xsl" | "xsd" | "svg" | "plist" => DocFormat::Xml,
65        _ => anyhow::bail!("unsupported document format: {ext}"),
66    };
67
68    let (text, title, page_count) = match format {
69        DocFormat::Markdown
70        | DocFormat::PlainText
71        | DocFormat::Rst
72        | DocFormat::Asciidoc
73        | DocFormat::Org => extract_text(bytes)?,
74        DocFormat::Pdf => extract_pdf(path, bytes)?,
75        DocFormat::Docx => extract_docx(bytes)?,
76        DocFormat::Pptx => extract_pptx(bytes)?,
77        DocFormat::Xlsx => extract_xlsx(bytes)?,
78        DocFormat::Html => extract_html(bytes)?,
79        DocFormat::Rtf => extract_rtf(bytes)?,
80        DocFormat::Xml => extract_xml(bytes)?,
81    };
82
83    Ok(ExtractedDoc {
84        file: String::new(),
85        title,
86        content_hash: String::new(),
87        format,
88        text,
89        page_count,
90    })
91}
92
/// Decodes plain-text style documents (lossy UTF-8) and guesses a title.
///
/// The title is the first line with any leading `#` characters and the
/// surrounding whitespace removed; it is `None` when that leaves nothing or
/// when the input has no lines. The page count is always `None`.
fn extract_text(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
    let content = String::from_utf8_lossy(bytes).into_owned();

    let heading = match content.lines().next() {
        Some(first_line) => {
            let stripped = first_line.trim_start_matches('#').trim();
            if stripped.is_empty() {
                None
            } else {
                Some(stripped.to_string())
            }
        }
        None => None,
    };

    Ok((content, heading, None))
}
102
103fn extract_pdf(path: &Path, bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
104    match pdf_oxide::PdfDocument::from_bytes(bytes.to_vec()) {
105        Ok(doc) => {
106            let page_count = doc.page_count().unwrap_or(0);
107            let mut pages_text = Vec::new();
108            for i in 0..page_count {
109                match doc.extract_text(i) {
110                    Ok(text) => pages_text.push(text),
111                    Err(e) => {
112                        eprintln!(
113                            "warning: PDF page {} extraction failed in {}: {e}",
114                            i + 1,
115                            path.display()
116                        );
117                    }
118                }
119            }
120            let text = pages_text.join("\n");
121            let title = text
122                .lines()
123                .next()
124                .map(|l| l.trim().to_string())
125                .filter(|t| !t.is_empty());
126            let count = if page_count > 0 {
127                Some(page_count)
128            } else {
129                None
130            };
131            Ok((text, title, count))
132        }
133        Err(_) => {
134            let text = String::from_utf8_lossy(bytes);
135            if text.is_ascii() && text.len() > 10 {
136                let title = text
137                    .lines()
138                    .next()
139                    .map(|l| l.trim().to_string())
140                    .filter(|t| !t.is_empty());
141                Ok((text.into_owned(), title, None))
142            } else {
143                anyhow::bail!("PDF extraction failed: {}", path.display())
144            }
145        }
146    }
147}
148
149fn extract_docx(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
150    let cursor = std::io::Cursor::new(bytes);
151    let mut archive = zip::ZipArchive::new(cursor).context("DOCX is not a valid ZIP archive")?;
152
153    let mut text = String::new();
154    let mut title = None;
155
156    if let Ok(mut file) = archive.by_name("word/document.xml") {
157        let mut xml = String::new();
158        std::io::Read::read_to_string(&mut file, &mut xml)?;
159        text = extract_text_from_ooxml(&xml);
160        title = text
161            .lines()
162            .next()
163            .map(|l| l.trim().to_string())
164            .filter(|t| !t.is_empty());
165    }
166
167    Ok((text, title, None))
168}
169
170fn extract_pptx(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
171    let cursor = std::io::Cursor::new(bytes);
172    let mut archive = zip::ZipArchive::new(cursor).context("PPTX is not a valid ZIP archive")?;
173
174    let mut all_text = Vec::new();
175    let mut slide_names: Vec<String> = Vec::new();
176
177    for i in 0..archive.len() {
178        let file = archive.by_index(i)?;
179        let name = file.name().to_string();
180        if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") {
181            slide_names.push(name);
182        }
183    }
184    slide_names.sort();
185
186    let page_count = Some(slide_names.len());
187
188    for name in &slide_names {
189        if let Ok(mut file) = archive.by_name(name) {
190            let mut xml = String::new();
191            std::io::Read::read_to_string(&mut file, &mut xml)?;
192            let slide_text = extract_text_from_ooxml(&xml);
193            if !slide_text.is_empty() {
194                all_text.push(slide_text);
195            }
196        }
197    }
198
199    let text = all_text.join("\n\n");
200    let title = text
201        .lines()
202        .next()
203        .map(|l| l.trim().to_string())
204        .filter(|t| !t.is_empty());
205    Ok((text, title, page_count))
206}
207
208fn extract_xlsx(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
209    let cursor = std::io::Cursor::new(bytes);
210    let mut workbook =
211        calamine::open_workbook_auto_from_rs(cursor).context("Failed to open spreadsheet")?;
212
213    let mut all_text = Vec::new();
214    let sheet_names: Vec<String> = workbook.sheet_names().to_vec();
215    let page_count = Some(sheet_names.len());
216
217    for name in &sheet_names {
218        if let Ok(range) = workbook.worksheet_range(name) {
219            let mut sheet_text = format!("Sheet: {}\n", name);
220            for row in range.rows() {
221                let cells: Vec<String> = row.iter().map(|cell| format!("{}", cell)).collect();
222                let line = cells.join("\t");
223                if !line.trim().is_empty() {
224                    sheet_text.push_str(&line);
225                    sheet_text.push('\n');
226                }
227            }
228            all_text.push(sheet_text);
229        }
230    }
231
232    let text = all_text.join("\n");
233    let title = sheet_names.first().cloned();
234    Ok((text, title, page_count))
235}
236
/// Strips markup from an HTML document and returns its visible text.
///
/// The input is decoded as lossy UTF-8. Everything between `<` and `>` is
/// dropped, a space is inserted where a tag ended, and all runs of whitespace
/// are collapsed to a single space. This is a simple tag stripper, not an
/// HTML parser: entity references are left as written.
///
/// The title is the trimmed content of the first `<title>…</title>` pair,
/// matched ASCII case-insensitively; it is `None` when the pair is missing
/// or its content is blank. The page count is always `None`.
fn extract_html(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
    const TITLE_OPEN: &str = "<title>";
    const TITLE_CLOSE: &str = "</title>";

    let html = String::from_utf8_lossy(bytes);

    // ASCII lowercasing never changes byte lengths, so offsets found in the
    // lowered copy are valid (and on char boundaries) in the original text.
    let lowered = html.to_ascii_lowercase();
    let title = lowered.find(TITLE_OPEN).and_then(|open_at| {
        let body_start = open_at + TITLE_OPEN.len();
        let body_len = lowered[body_start..].find(TITLE_CLOSE)?;
        let raw = html[body_start..body_start + body_len].trim();
        // An empty <title></title> is reported as "no title".
        (!raw.is_empty()).then(|| raw.to_string())
    });

    // Drop everything inside <...>; a closing '>' leaves a space behind so
    // that adjacent elements do not fuse their text together.
    let mut stripped = String::new();
    let mut in_tag = false;
    for ch in html.chars() {
        match ch {
            '<' => in_tag = true,
            '>' => {
                in_tag = false;
                if !stripped.ends_with('\n') && !stripped.ends_with(' ') {
                    stripped.push(' ');
                }
            }
            _ if !in_tag => stripped.push(ch),
            _ => {}
        }
    }

    // Trim and collapse whitespace runs without compiling a regex per call.
    let text = stripped.split_whitespace().collect::<Vec<_>>().join(" ");

    Ok((text, title, None))
}
273
/// Extracts readable text from an RTF document with a lightweight scanner.
///
/// The input is decoded as lossy UTF-8 and scanned character by character:
///
/// * `{` and `}` open and close groups; only text at group depth 2 or less
///   is kept.
/// * A backslash starts a control word, which is discarded. It ends at the
///   next space or newline (replaced by a single space in the output), at a
///   group delimiter, or at the next backslash.
/// * The escapes `\\`, `\{` and `\}` produce the literal character.
///
/// The title is the trimmed first line of the resulting text. The page count
/// is always `None`.
fn extract_rtf(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
    let rtf = String::from_utf8_lossy(bytes);
    let mut text = String::new();
    let mut in_control = false;
    let mut brace_depth = 0i32;

    let mut chars = rtf.chars().peekable();
    while let Some(ch) = chars.next() {
        match ch {
            // A group delimiter also terminates a pending control word;
            // otherwise text following e.g. `{\colortbl;...;}` would be
            // swallowed until the next space.
            '{' => {
                brace_depth += 1;
                in_control = false;
            }
            '}' => {
                brace_depth -= 1;
                in_control = false;
            }
            '\\' => match chars.peek().copied() {
                // Escaped literal: emit it instead of treating it as a
                // control word or as a group delimiter.
                Some(escaped @ ('\\' | '{' | '}')) => {
                    chars.next();
                    in_control = false;
                    if brace_depth <= 2 {
                        text.push(escaped);
                    }
                }
                _ => in_control = true,
            },
            ' ' | '\n' if in_control => {
                in_control = false;
                if brace_depth <= 2 {
                    text.push(' ');
                }
            }
            // Characters of the control word itself.
            _ if in_control => {}
            _ if brace_depth <= 2 => text.push(ch),
            _ => {}
        }
    }

    let text = text.trim().to_string();
    let title = text
        .lines()
        .next()
        .map(|l| l.trim().to_string())
        .filter(|t| !t.is_empty());
    Ok((text, title, None))
}
305
306fn extract_xml(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
307    let xml_str = String::from_utf8_lossy(bytes);
308    let mut text = String::new();
309    let mut title = None;
310    let mut reader = quick_xml::Reader::from_str(&xml_str);
311    let mut buf = Vec::new();
312    let mut depth = 0u32;
313
314    loop {
315        match reader.read_event_into(&mut buf) {
316            Ok(quick_xml::events::Event::Start(ref e)) => {
317                depth += 1;
318                if depth > 1 && !text.is_empty() && !text.ends_with('\n') {
319                    text.push('\n');
320                }
321                // Use root element name as title if not set
322                if depth == 1 && title.is_none() {
323                    let local = e.local_name();
324                    let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
325                    if !name.is_empty() {
326                        title = Some(name.to_string());
327                    }
328                }
329            }
330            Ok(quick_xml::events::Event::Text(ref e)) => {
331                if let Ok(t) = e.unescape() {
332                    let trimmed = t.trim();
333                    if !trimmed.is_empty() {
334                        if !text.is_empty() && !text.ends_with('\n') && !text.ends_with(' ') {
335                            text.push(' ');
336                        }
337                        text.push_str(trimmed);
338                    }
339                }
340            }
341            Ok(quick_xml::events::Event::End(_)) => {
342                depth = depth.saturating_sub(1);
343            }
344            Ok(quick_xml::events::Event::Eof) => break,
345            Err(_) => break,
346            _ => {}
347        }
348        buf.clear();
349    }
350
351    let text = text.trim().to_string();
352    Ok((text, title, None))
353}
354
355fn extract_text_from_ooxml(xml: &str) -> String {
356    let mut text = String::new();
357    let mut reader = quick_xml::Reader::from_str(xml);
358    let mut in_text = false;
359    let mut buf = Vec::new();
360
361    loop {
362        match reader.read_event_into(&mut buf) {
363            Ok(quick_xml::events::Event::Start(ref e))
364            | Ok(quick_xml::events::Event::Empty(ref e)) => {
365                let local = e.local_name();
366                let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
367                if name == "t" {
368                    in_text = true;
369                }
370                // Paragraph boundary
371                if name == "p" && !text.is_empty() && !text.ends_with('\n') {
372                    text.push('\n');
373                }
374            }
375            Ok(quick_xml::events::Event::Text(ref e)) if in_text => {
376                if let Ok(t) = e.unescape() {
377                    text.push_str(&t);
378                }
379            }
380            Ok(quick_xml::events::Event::End(ref e)) => {
381                let local = e.local_name();
382                let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
383                if name == "t" {
384                    in_text = false;
385                }
386            }
387            Ok(quick_xml::events::Event::Eof) => break,
388            Err(_) => break,
389            _ => {}
390        }
391        buf.clear();
392    }
393
394    text
395}