// marque_extract/extractor.rs
use crate::metadata::MetadataReport;
use std::path::Path;
use thiserror::Error;

14#[derive(Debug, Error)]
15pub enum ExtractError {
16 #[error("unsupported format: {0}")]
17 UnsupportedFormat(String),
18
19 #[error("extraction failed: {0}")]
20 ExtractionFailed(String),
21
22 #[error("I/O error: {0}")]
23 Io(#[from] std::io::Error),
24}
25
/// Caller-supplied switches for an extraction run.
///
/// The derived `Default` leaves every flag `false`.
#[derive(Debug, Clone, Default)]
pub struct ExtractionOptions {
    /// Presumably requests that metadata be collected alongside the text.
    /// NOTE(review): not read by any code visible in this file; confirm.
    pub extract_metadata: bool,
    /// Presumably requests that metadata be removed from the output.
    /// NOTE(review): not read by any code visible in this file; confirm.
    pub strip_metadata: bool,
    /// Presumably enables optical character recognition.
    /// NOTE(review): not read by any code visible in this file; confirm.
    pub ocr: bool,
}

37#[derive(Debug)]
39pub struct ExtractedDocument {
40 pub text: Vec<u8>,
42 pub metadata: Option<MetadataReport>,
44 pub format: DetectedFormat,
46}
47
/// Document formats distinguished by the extractor.
///
/// Only [`DetectedFormat::PlainText`] has a working extraction path in the
/// code visible in this file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DetectedFormat {
    /// Plain text; extracted by passing the bytes through unchanged.
    PlainText,
    /// Word document (`.docx`).
    Docx,
    /// PDF document (`.pdf`).
    Pdf,
    /// HTML document.
    Html,
    /// Spreadsheet (`.xlsx`).
    Xlsx,
    /// Presentation (`.pptx`).
    Pptx,
    /// Email message.
    Email,
    /// A format that matched none of the above.
    /// NOTE(review): the payload is never produced in this file; presumably a
    /// label for the unrecognized format. Confirm against callers.
    Unknown(String),
}

/// Stateless namespace for the extraction entry points.
///
/// Carries no data; its operations are associated functions called as
/// `Extractor::extract(..)` / `Extractor::extract_bytes(..)`.
pub struct Extractor;

63impl Extractor {
64 pub async fn extract(
66 path: &Path,
67 _opts: ExtractionOptions,
68 ) -> Result<ExtractedDocument, ExtractError> {
69 let ext = path
72 .extension()
73 .and_then(|e| e.to_str())
74 .unwrap_or("")
75 .to_lowercase();
76
77 match ext.as_str() {
78 "txt" | "text" => {
79 let text = tokio::fs::read(path).await?;
80 Ok(ExtractedDocument {
81 text,
82 metadata: None,
83 format: DetectedFormat::PlainText,
84 })
85 }
86 "docx" => Err(ExtractError::UnsupportedFormat(
87 "docx extraction requires Kreuzberg integration (TODO)".into(),
88 )),
89 "pdf" => Err(ExtractError::UnsupportedFormat(
90 "pdf extraction requires Kreuzberg integration (TODO)".into(),
91 )),
92 other => Err(ExtractError::UnsupportedFormat(other.to_owned())),
93 }
94 }
95
96 pub fn extract_bytes(
98 data: &[u8],
99 format: DetectedFormat,
100 _opts: ExtractionOptions,
101 ) -> Result<ExtractedDocument, ExtractError> {
102 match format {
103 DetectedFormat::PlainText => Ok(ExtractedDocument {
104 text: data.to_vec(),
105 metadata: None,
106 format: DetectedFormat::PlainText,
107 }),
108 _ => Err(ExtractError::UnsupportedFormat(
109 "non-text extraction requires Kreuzberg integration (TODO)".into(),
110 )),
111 }
112 }
113}