marque_extract/
extractor.rs1use crate::metadata::MetadataReport;
7use std::path::Path;
8use thiserror::Error;
9
10#[derive(Debug, Error)]
11pub enum ExtractError {
12 #[error("unsupported format: {0}")]
13 UnsupportedFormat(String),
14
15 #[error("extraction failed: {0}")]
16 ExtractionFailed(String),
17
18 #[error("I/O error: {0}")]
19 Io(#[from] std::io::Error),
20}
21
/// Caller-supplied switches for an extraction run.
///
/// `Default` yields every flag set to `false`.
///
/// NOTE(review): no code visible in this file reads these flags yet, so the
/// per-field descriptions below are inferred from the names — confirm.
#[derive(Clone, Debug, Default)]
pub struct ExtractionOptions {
    /// Presumably requests that document metadata be collected.
    pub extract_metadata: bool,
    /// Presumably requests that metadata be removed from the output.
    pub strip_metadata: bool,
    /// Presumably enables optical character recognition.
    pub ocr: bool,
}
32
33#[derive(Debug)]
35pub struct ExtractedDocument {
36 pub text: Vec<u8>,
38 pub metadata: Option<MetadataReport>,
40 pub format: DetectedFormat,
42}
43
/// Document formats the extractor distinguishes between.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum DetectedFormat {
    /// Plain text.
    PlainText,
    /// DOCX document.
    Docx,
    /// PDF document.
    Pdf,
    /// HTML document.
    Html,
    /// XLSX document.
    Xlsx,
    /// PPTX document.
    Pptx,
    /// Email message.
    Email,
    /// A format outside the list above, identified by a free-form label.
    Unknown(String),
}
55
56pub struct Extractor;
58
59impl Extractor {
60 pub async fn extract(
62 path: &Path,
63 _opts: ExtractionOptions,
64 ) -> Result<ExtractedDocument, ExtractError> {
65 let ext = path
68 .extension()
69 .and_then(|e| e.to_str())
70 .unwrap_or("")
71 .to_lowercase();
72
73 match ext.as_str() {
74 "txt" | "text" => {
75 let text = tokio::fs::read(path).await?;
76 Ok(ExtractedDocument {
77 text,
78 metadata: None,
79 format: DetectedFormat::PlainText,
80 })
81 }
82 "docx" => Err(ExtractError::UnsupportedFormat(
83 "docx extraction requires Kreuzberg integration (TODO)".into(),
84 )),
85 "pdf" => Err(ExtractError::UnsupportedFormat(
86 "pdf extraction requires Kreuzberg integration (TODO)".into(),
87 )),
88 other => Err(ExtractError::UnsupportedFormat(other.to_owned())),
89 }
90 }
91
92 pub fn extract_bytes(
94 data: &[u8],
95 format: DetectedFormat,
96 _opts: ExtractionOptions,
97 ) -> Result<ExtractedDocument, ExtractError> {
98 match format {
99 DetectedFormat::PlainText => Ok(ExtractedDocument {
100 text: data.to_vec(),
101 metadata: None,
102 format: DetectedFormat::PlainText,
103 }),
104 _ => Err(ExtractError::UnsupportedFormat(
105 "non-text extraction requires Kreuzberg integration (TODO)".into(),
106 )),
107 }
108 }
109}