// infigraph_docs/extract.rs

1use std::path::Path;
2
3use anyhow::{Context, Result};
4use calamine::Reader;
5
6#[derive(Debug, Clone)]
7pub struct ExtractedDoc {
8    pub file: String,
9    pub title: Option<String>,
10    pub content_hash: String,
11    pub format: DocFormat,
12    pub text: String,
13    pub page_count: Option<usize>,
14}
15
/// Document formats the extractor knows how to handle.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DocFormat {
    Markdown,
    PlainText,
    Rst,
    Asciidoc,
    Org,
    Pdf,
    Docx,
    Pptx,
    Xlsx,
    Html,
    Rtf,
    Xml,
}

impl DocFormat {
    /// Returns the lowercase identifier of this format, e.g. `"markdown"` or `"pdf"`.
    ///
    /// Note that [`DocFormat::PlainText`] maps to `"text"`.
    pub fn as_str(&self) -> &'static str {
        match *self {
            DocFormat::Markdown => "markdown",
            DocFormat::PlainText => "text",
            DocFormat::Rst => "rst",
            DocFormat::Asciidoc => "asciidoc",
            DocFormat::Org => "org",
            DocFormat::Pdf => "pdf",
            DocFormat::Docx => "docx",
            DocFormat::Pptx => "pptx",
            DocFormat::Xlsx => "xlsx",
            DocFormat::Html => "html",
            DocFormat::Rtf => "rtf",
            DocFormat::Xml => "xml",
        }
    }
}
50
51pub fn extract_document(path: &Path, bytes: &[u8], ext: &str) -> Result<ExtractedDoc> {
52    let format = match ext {
53        "md" | "markdown" => DocFormat::Markdown,
54        "txt" => DocFormat::PlainText,
55        "rst" => DocFormat::Rst,
56        "adoc" => DocFormat::Asciidoc,
57        "org" => DocFormat::Org,
58        "pdf" => DocFormat::Pdf,
59        "docx" => DocFormat::Docx,
60        "pptx" => DocFormat::Pptx,
61        "xlsx" => DocFormat::Xlsx,
62        "html" | "htm" => DocFormat::Html,
63        "rtf" => DocFormat::Rtf,
64        "xml" | "xsl" | "xsd" | "svg" | "plist" => DocFormat::Xml,
65        _ => anyhow::bail!("unsupported document format: {ext}"),
66    };
67
68    let (text, title, page_count) = match format {
69        DocFormat::Markdown
70        | DocFormat::PlainText
71        | DocFormat::Rst
72        | DocFormat::Asciidoc
73        | DocFormat::Org => extract_text(bytes)?,
74        DocFormat::Pdf => extract_pdf(path, bytes)?,
75        DocFormat::Docx => extract_docx(bytes)?,
76        DocFormat::Pptx => extract_pptx(bytes)?,
77        DocFormat::Xlsx => extract_xlsx(bytes)?,
78        DocFormat::Html => extract_html(bytes)?,
79        DocFormat::Rtf => extract_rtf(bytes)?,
80        DocFormat::Xml => extract_xml(bytes)?,
81    };
82
83    Ok(ExtractedDoc {
84        file: String::new(),
85        title,
86        content_hash: String::new(),
87        format,
88        text,
89        page_count,
90    })
91}
92
93fn extract_text(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
94    let text = String::from_utf8_lossy(bytes).into_owned();
95    let title = text
96        .lines()
97        .next()
98        .map(|l| l.trim_start_matches('#').trim().to_string())
99        .filter(|t| !t.is_empty());
100    Ok((text, title, None))
101}
102
103fn extract_pdf(path: &Path, bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
104    match pdf_oxide::PdfDocument::from_bytes(bytes.to_vec()) {
105        Ok(doc) => {
106            let page_count = doc.page_count().unwrap_or(0);
107            let mut pages_text = Vec::new();
108            for i in 0..page_count {
109                match doc.extract_text(i) {
110                    Ok(text) => pages_text.push(text),
111                    Err(e) => {
112                        eprintln!("warning: PDF page {} extraction failed in {}: {e}", i + 1, path.display());
113                    }
114                }
115            }
116            let text = pages_text.join("\n");
117            let title = text.lines().next().map(|l| l.trim().to_string()).filter(|t| !t.is_empty());
118            let count = if page_count > 0 { Some(page_count) } else { None };
119            Ok((text, title, count))
120        }
121        Err(_) => {
122            let text = String::from_utf8_lossy(bytes);
123            if text.is_ascii() && text.len() > 10 {
124                let title = text.lines().next().map(|l| l.trim().to_string()).filter(|t| !t.is_empty());
125                Ok((text.into_owned(), title, None))
126            } else {
127                anyhow::bail!("PDF extraction failed: {}", path.display())
128            }
129        }
130    }
131}
132
133fn extract_docx(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
134    let cursor = std::io::Cursor::new(bytes);
135    let mut archive = zip::ZipArchive::new(cursor).context("DOCX is not a valid ZIP archive")?;
136
137    let mut text = String::new();
138    let mut title = None;
139
140    if let Ok(mut file) = archive.by_name("word/document.xml") {
141        let mut xml = String::new();
142        std::io::Read::read_to_string(&mut file, &mut xml)?;
143        text = extract_text_from_ooxml(&xml);
144        title = text
145            .lines()
146            .next()
147            .map(|l| l.trim().to_string())
148            .filter(|t| !t.is_empty());
149    }
150
151    Ok((text, title, None))
152}
153
154fn extract_pptx(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
155    let cursor = std::io::Cursor::new(bytes);
156    let mut archive = zip::ZipArchive::new(cursor).context("PPTX is not a valid ZIP archive")?;
157
158    let mut all_text = Vec::new();
159    let mut slide_names: Vec<String> = Vec::new();
160
161    for i in 0..archive.len() {
162        let file = archive.by_index(i)?;
163        let name = file.name().to_string();
164        if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") {
165            slide_names.push(name);
166        }
167    }
168    slide_names.sort();
169
170    let page_count = Some(slide_names.len());
171
172    for name in &slide_names {
173        if let Ok(mut file) = archive.by_name(name) {
174            let mut xml = String::new();
175            std::io::Read::read_to_string(&mut file, &mut xml)?;
176            let slide_text = extract_text_from_ooxml(&xml);
177            if !slide_text.is_empty() {
178                all_text.push(slide_text);
179            }
180        }
181    }
182
183    let text = all_text.join("\n\n");
184    let title = text
185        .lines()
186        .next()
187        .map(|l| l.trim().to_string())
188        .filter(|t| !t.is_empty());
189    Ok((text, title, page_count))
190}
191
192fn extract_xlsx(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
193    let cursor = std::io::Cursor::new(bytes);
194    let mut workbook =
195        calamine::open_workbook_auto_from_rs(cursor).context("Failed to open spreadsheet")?;
196
197    let mut all_text = Vec::new();
198    let sheet_names: Vec<String> = workbook.sheet_names().to_vec();
199    let page_count = Some(sheet_names.len());
200
201    for name in &sheet_names {
202        if let Ok(range) = workbook.worksheet_range(name) {
203            let mut sheet_text = format!("Sheet: {}\n", name);
204            for row in range.rows() {
205                let cells: Vec<String> = row.iter().map(|cell| format!("{}", cell)).collect();
206                let line = cells.join("\t");
207                if !line.trim().is_empty() {
208                    sheet_text.push_str(&line);
209                    sheet_text.push('\n');
210                }
211            }
212            all_text.push(sheet_text);
213        }
214    }
215
216    let text = all_text.join("\n");
217    let title = sheet_names.first().cloned();
218    Ok((text, title, page_count))
219}
220
221fn extract_html(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
222    let html = String::from_utf8_lossy(bytes);
223    let mut text = String::new();
224    let mut in_tag = false;
225    let mut title = None;
226
227    // Extract title from <title> tag
228    if let Some(start) = html.find("<title>") {
229        if let Some(end) = html[start..].find("</title>") {
230            title = Some(html[start + 7..start + end].trim().to_string());
231        }
232    }
233
234    // Strip HTML tags — simple but effective for text extraction
235    for ch in html.chars() {
236        match ch {
237            '<' => in_tag = true,
238            '>' => {
239                in_tag = false;
240                if !text.ends_with('\n') && !text.ends_with(' ') {
241                    text.push(' ');
242                }
243            }
244            _ if !in_tag => text.push(ch),
245            _ => {}
246        }
247    }
248
249    // Collapse whitespace
250    let text = regex::Regex::new(r"\s+")
251        .unwrap()
252        .replace_all(text.trim(), " ")
253        .into_owned();
254
255    Ok((text, title, None))
256}
257
258fn extract_rtf(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
259    let rtf = String::from_utf8_lossy(bytes);
260    let mut text = String::new();
261    let mut in_control = false;
262    let mut brace_depth = 0i32;
263
264    for ch in rtf.chars() {
265        match ch {
266            '{' => brace_depth += 1,
267            '}' => brace_depth -= 1,
268            '\\' => in_control = true,
269            ' ' | '\n' if in_control => {
270                in_control = false;
271                if brace_depth <= 2 {
272                    text.push(' ');
273                }
274            }
275            _ if in_control => {}
276            _ if brace_depth <= 2 => text.push(ch),
277            _ => {}
278        }
279    }
280
281    let text = text.trim().to_string();
282    let title = text
283        .lines()
284        .next()
285        .map(|l| l.trim().to_string())
286        .filter(|t| !t.is_empty());
287    Ok((text, title, None))
288}
289
290fn extract_xml(bytes: &[u8]) -> Result<(String, Option<String>, Option<usize>)> {
291    let xml_str = String::from_utf8_lossy(bytes);
292    let mut text = String::new();
293    let mut title = None;
294    let mut reader = quick_xml::Reader::from_str(&xml_str);
295    let mut buf = Vec::new();
296    let mut depth = 0u32;
297
298    loop {
299        match reader.read_event_into(&mut buf) {
300            Ok(quick_xml::events::Event::Start(ref e)) => {
301                depth += 1;
302                if depth > 1 && !text.is_empty() && !text.ends_with('\n') {
303                    text.push('\n');
304                }
305                // Use root element name as title if not set
306                if depth == 1 && title.is_none() {
307                    let local = e.local_name();
308                    let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
309                    if !name.is_empty() {
310                        title = Some(name.to_string());
311                    }
312                }
313            }
314            Ok(quick_xml::events::Event::Text(ref e)) => {
315                if let Ok(t) = e.unescape() {
316                    let trimmed = t.trim();
317                    if !trimmed.is_empty() {
318                        if !text.is_empty() && !text.ends_with('\n') && !text.ends_with(' ') {
319                            text.push(' ');
320                        }
321                        text.push_str(trimmed);
322                    }
323                }
324            }
325            Ok(quick_xml::events::Event::End(_)) => {
326                depth = depth.saturating_sub(1);
327            }
328            Ok(quick_xml::events::Event::Eof) => break,
329            Err(_) => break,
330            _ => {}
331        }
332        buf.clear();
333    }
334
335    let text = text.trim().to_string();
336    Ok((text, title, None))
337}
338
339fn extract_text_from_ooxml(xml: &str) -> String {
340    let mut text = String::new();
341    let mut reader = quick_xml::Reader::from_str(xml);
342    let mut in_text = false;
343    let mut buf = Vec::new();
344
345    loop {
346        match reader.read_event_into(&mut buf) {
347            Ok(quick_xml::events::Event::Start(ref e))
348            | Ok(quick_xml::events::Event::Empty(ref e)) => {
349                let local = e.local_name();
350                let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
351                if name == "t" {
352                    in_text = true;
353                }
354                // Paragraph boundary
355                if name == "p" && !text.is_empty() && !text.ends_with('\n') {
356                    text.push('\n');
357                }
358            }
359            Ok(quick_xml::events::Event::Text(ref e)) if in_text => {
360                if let Ok(t) = e.unescape() {
361                    text.push_str(&t);
362                }
363            }
364            Ok(quick_xml::events::Event::End(ref e)) => {
365                let local = e.local_name();
366                let name = std::str::from_utf8(local.as_ref()).unwrap_or("");
367                if name == "t" {
368                    in_text = false;
369                }
370            }
371            Ok(quick_xml::events::Event::Eof) => break,
372            Err(_) => break,
373            _ => {}
374        }
375        buf.clear();
376    }
377
378    text
379}