Skip to main content

marque_extract/
extractor.rs

//! Document text extraction with streaming support.
//!
//! TODO: wire in Kreuzberg once the crate dependency is confirmed.
//! The current implementation is a stub that handles plain-text input only
//! and reads it fully into memory (streaming is not implemented yet).
5
6use crate::metadata::MetadataReport;
7use std::path::Path;
8use thiserror::Error;
9
10#[derive(Debug, Error)]
11pub enum ExtractError {
12    #[error("unsupported format: {0}")]
13    UnsupportedFormat(String),
14
15    #[error("extraction failed: {0}")]
16    ExtractionFailed(String),
17
18    #[error("I/O error: {0}")]
19    Io(#[from] std::io::Error),
20}
21
/// Caller-supplied switches that steer how a document is extracted.
///
/// Every flag is `false` in the value produced by `Default`.
#[derive(Debug, Clone, Default)]
pub struct ExtractionOptions {
    /// When set, document metadata is to be extracted and reported.
    pub extract_metadata: bool,
    /// When set, metadata is to be removed from the output document
    /// (a sanitized copy is created).
    pub strip_metadata: bool,
    /// When set, OCR is to be attempted on image-based pages; this requires
    /// an OCR backend.
    pub ocr: bool,
}
32
33/// Result of document extraction.
34#[derive(Debug)]
35pub struct ExtractedDocument {
36    /// Extracted text content, UTF-8.
37    pub text: Vec<u8>,
38    /// Metadata report (populated if `extract_metadata` was set).
39    pub metadata: Option<MetadataReport>,
40    /// Original format detected.
41    pub format: DetectedFormat,
42}
43
/// Document formats the extractor tells apart.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DetectedFormat {
    /// Plain text.
    PlainText,
    /// `.docx` word-processing document.
    Docx,
    /// PDF document.
    Pdf,
    /// HTML document.
    Html,
    /// `.xlsx` spreadsheet.
    Xlsx,
    /// `.pptx` presentation.
    Pptx,
    /// Email message.
    Email,
    /// Any other format. The string payload presumably carries an identifying
    /// label (e.g. an extension) — its exact meaning is not fixed here.
    Unknown(String),
}
55
/// Entry point for document extraction.
///
/// A unit struct: it holds no data, so there is no per-instance state.
pub struct Extractor;
58
59impl Extractor {
60    /// Extract text (and optionally metadata) from a file.
61    pub async fn extract(
62        path: &Path,
63        _opts: ExtractionOptions,
64    ) -> Result<ExtractedDocument, ExtractError> {
65        // TODO: delegate to Kreuzberg for full format support.
66        // Stub: read raw bytes and return as-is for plain text.
67        let ext = path
68            .extension()
69            .and_then(|e| e.to_str())
70            .unwrap_or("")
71            .to_lowercase();
72
73        match ext.as_str() {
74            "txt" | "text" => {
75                let text = tokio::fs::read(path).await?;
76                Ok(ExtractedDocument {
77                    text,
78                    metadata: None,
79                    format: DetectedFormat::PlainText,
80                })
81            }
82            "docx" => Err(ExtractError::UnsupportedFormat(
83                "docx extraction requires Kreuzberg integration (TODO)".into(),
84            )),
85            "pdf" => Err(ExtractError::UnsupportedFormat(
86                "pdf extraction requires Kreuzberg integration (TODO)".into(),
87            )),
88            other => Err(ExtractError::UnsupportedFormat(other.to_owned())),
89        }
90    }
91
92    /// Extract from an in-memory buffer with an explicit format hint.
93    pub fn extract_bytes(
94        data: &[u8],
95        format: DetectedFormat,
96        _opts: ExtractionOptions,
97    ) -> Result<ExtractedDocument, ExtractError> {
98        match format {
99            DetectedFormat::PlainText => Ok(ExtractedDocument {
100                text: data.to_vec(),
101                metadata: None,
102                format: DetectedFormat::PlainText,
103            }),
104            _ => Err(ExtractError::UnsupportedFormat(
105                "non-text extraction requires Kreuzberg integration (TODO)".into(),
106            )),
107        }
108    }
109}