Skip to main content

marque_extract/
extractor.rs

1// SPDX-FileCopyrightText: 2026 Knitli Inc.
2//
3// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0
4
5//! Document text extraction with streaming support.
6//!
7//! TODO: wire Kreuzberg once crate dependency is confirmed.
8//! Current implementation is a stub that reads raw text files only.
9
10use crate::metadata::MetadataReport;
11use std::path::Path;
12use thiserror::Error;
13
14#[derive(Debug, Error)]
15pub enum ExtractError {
16    #[error("unsupported format: {0}")]
17    UnsupportedFormat(String),
18
19    #[error("extraction failed: {0}")]
20    ExtractionFailed(String),
21
22    #[error("I/O error: {0}")]
23    Io(#[from] std::io::Error),
24}
25
/// Caller-supplied switches that tune how a document is extracted.
///
/// Every flag is `false` in the value produced by `Default`, so callers can
/// enable only what they need with struct-update syntax:
/// `ExtractionOptions { ocr: true, ..Default::default() }`.
#[derive(Clone, Debug, Default)]
pub struct ExtractionOptions {
    /// When `true`, document metadata should be extracted and reported.
    pub extract_metadata: bool,
    /// When `true`, metadata should be removed from the output document,
    /// i.e. a sanitized copy is created.
    pub strip_metadata: bool,
    /// When `true`, OCR should be attempted on image-based pages.
    /// This requires an OCR backend to be available.
    pub ocr: bool,
}
36
37/// Result of document extraction.
38#[derive(Debug)]
39pub struct ExtractedDocument {
40    /// Extracted text content, UTF-8.
41    pub text: Vec<u8>,
42    /// Metadata report (populated if `extract_metadata` was set).
43    pub metadata: Option<MetadataReport>,
44    /// Original format detected.
45    pub format: DetectedFormat,
46}
47
/// The document format identified for an input.
///
/// Values compare by variant, and `Unknown` additionally compares by its
/// string payload.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum DetectedFormat {
    /// Plain, unstructured text.
    PlainText,
    /// A `.docx` word-processing document.
    Docx,
    /// A PDF document.
    Pdf,
    /// An HTML document.
    Html,
    /// An `.xlsx` spreadsheet.
    Xlsx,
    /// A `.pptx` presentation.
    Pptx,
    /// An email message.
    Email,
    /// A format that matches none of the variants above.
    ///
    /// NOTE(review): the payload is presumably the unrecognized extension or
    /// format label — confirm against the code that constructs it.
    Unknown(String),
}
59
60/// Stateless document extractor.
61pub struct Extractor;
62
63impl Extractor {
64    /// Extract text (and optionally metadata) from a file.
65    pub async fn extract(
66        path: &Path,
67        _opts: ExtractionOptions,
68    ) -> Result<ExtractedDocument, ExtractError> {
69        // TODO: delegate to Kreuzberg for full format support.
70        // Stub: read raw bytes and return as-is for plain text.
71        let ext = path
72            .extension()
73            .and_then(|e| e.to_str())
74            .unwrap_or("")
75            .to_lowercase();
76
77        match ext.as_str() {
78            "txt" | "text" => {
79                let text = tokio::fs::read(path).await?;
80                Ok(ExtractedDocument {
81                    text,
82                    metadata: None,
83                    format: DetectedFormat::PlainText,
84                })
85            }
86            "docx" => Err(ExtractError::UnsupportedFormat(
87                "docx extraction requires Kreuzberg integration (TODO)".into(),
88            )),
89            "pdf" => Err(ExtractError::UnsupportedFormat(
90                "pdf extraction requires Kreuzberg integration (TODO)".into(),
91            )),
92            other => Err(ExtractError::UnsupportedFormat(other.to_owned())),
93        }
94    }
95
96    /// Extract from an in-memory buffer with an explicit format hint.
97    pub fn extract_bytes(
98        data: &[u8],
99        format: DetectedFormat,
100        _opts: ExtractionOptions,
101    ) -> Result<ExtractedDocument, ExtractError> {
102        match format {
103            DetectedFormat::PlainText => Ok(ExtractedDocument {
104                text: data.to_vec(),
105                metadata: None,
106                format: DetectedFormat::PlainText,
107            }),
108            _ => Err(ExtractError::UnsupportedFormat(
109                "non-text extraction requires Kreuzberg integration (TODO)".into(),
110            )),
111        }
112    }
113}