use crate::metadata::MetadataReport;
use std::path::Path;
use thiserror::Error;
/// Errors that document extraction can return.
#[derive(Debug, Error)]
pub enum ExtractError {
/// The input's format has no extraction path. The payload is the text
/// appended to the "unsupported format: " message.
#[error("unsupported format: {0}")]
UnsupportedFormat(String),
/// Extraction was attempted but did not succeed; the payload is the text
/// appended to the "extraction failed: " message.
#[error("extraction failed: {0}")]
ExtractionFailed(String),
/// An underlying I/O error. The `#[from]` attribute lets `?` convert a
/// `std::io::Error` into this variant automatically.
#[error("I/O error: {0}")]
Io(#[from] std::io::Error),
}
/// Caller-supplied switches for an extraction run.
///
/// The derived `Default` leaves every flag `false`.
// NOTE(review): the per-field descriptions below are inferred from the field
// names only; confirm them against the code that consumes these options.
#[derive(Debug, Clone, Default)]
pub struct ExtractionOptions {
/// Presumably requests that document metadata be collected — TODO confirm.
pub extract_metadata: bool,
/// Presumably requests that metadata be removed from the output — TODO confirm.
pub strip_metadata: bool,
/// Presumably enables optical character recognition — TODO confirm.
pub ocr: bool,
}
/// The result of extracting one document.
#[derive(Debug)]
pub struct ExtractedDocument {
/// Extracted content as raw bytes; the type itself does not guarantee any
/// particular text encoding.
pub text: Vec<u8>,
/// Metadata report for the document, or `None` when none was produced.
pub metadata: Option<MetadataReport>,
/// The format the document was treated as.
pub format: DetectedFormat,
}
/// Document formats that the extraction layer distinguishes between.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DetectedFormat {
/// Plain text.
PlainText,
/// Word document (`.docx`).
Docx,
/// PDF document.
Pdf,
/// HTML document.
Html,
/// Excel workbook (`.xlsx`).
Xlsx,
/// PowerPoint presentation (`.pptx`).
Pptx,
/// Email message.
Email,
/// A format that is not one of the variants above.
// NOTE(review): the meaning of the payload (extension, MIME type, ...) is
// not established here — confirm with the code that constructs it.
Unknown(String),
}
/// Stateless entry point for document extraction; it carries no data and
/// serves as a namespace for the extraction functions.
pub struct Extractor;
impl Extractor {
    /// Extracts the document stored at `path`, picking the handler from the
    /// file's lower-cased extension.
    ///
    /// Only `txt` and `text` files are handled: their bytes are read as-is and
    /// returned as a [`DetectedFormat::PlainText`] document with no metadata.
    /// A missing extension, or one that is not valid UTF-8, is treated as the
    /// empty string. `_opts` is currently ignored.
    ///
    /// # Errors
    ///
    /// * [`ExtractError::Io`] if reading the file fails.
    /// * [`ExtractError::UnsupportedFormat`] for `docx` and `pdf` (with a
    ///   message noting the pending integration) and for every other
    ///   extension (carrying the lower-cased extension itself).
    pub async fn extract(
        path: &Path,
        _opts: ExtractionOptions,
    ) -> Result<ExtractedDocument, ExtractError> {
        let suffix = path
            .extension()
            .and_then(|raw| raw.to_str())
            .map(str::to_lowercase)
            .unwrap_or_default();

        // The one supported case returns early; everything below is a rejection.
        if matches!(suffix.as_str(), "txt" | "text") {
            let bytes = tokio::fs::read(path).await?;
            return Ok(ExtractedDocument {
                text: bytes,
                metadata: None,
                format: DetectedFormat::PlainText,
            });
        }

        let reason = match suffix.as_str() {
            "docx" => String::from("docx extraction requires Kreuzberg integration (TODO)"),
            "pdf" => String::from("pdf extraction requires Kreuzberg integration (TODO)"),
            unrecognised => unrecognised.to_owned(),
        };
        Err(ExtractError::UnsupportedFormat(reason))
    }

    /// Extracts a document from an in-memory buffer whose format the caller
    /// already knows.
    ///
    /// For [`DetectedFormat::PlainText`] the bytes of `data` are copied
    /// unchanged into the returned document, with no metadata. `_opts` is
    /// currently ignored.
    ///
    /// # Errors
    ///
    /// Returns [`ExtractError::UnsupportedFormat`] for every format other than
    /// [`DetectedFormat::PlainText`].
    pub fn extract_bytes(
        data: &[u8],
        format: DetectedFormat,
        _opts: ExtractionOptions,
    ) -> Result<ExtractedDocument, ExtractError> {
        // Guard clause: reject anything that is not plain text up front.
        if format != DetectedFormat::PlainText {
            return Err(ExtractError::UnsupportedFormat(String::from(
                "non-text extraction requires Kreuzberg integration (TODO)",
            )));
        }

        Ok(ExtractedDocument {
            text: Vec::from(data),
            metadata: None,
            format,
        })
    }
}