use orbok_core::{ErrorCategory, OrbokResult};
use orbok_fs::ValidatedPath;
use serde::{Deserialize, Serialize};
/// Classification tag stored in [`ExtractedSegment::kind`].
///
/// Variants are (de)serialized under their `snake_case` names, e.g.
/// `CodeBlock` becomes `"code_block"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SegmentKind {
    /// Segment tagged as a heading.
    Heading,
    /// Segment tagged as a paragraph.
    Paragraph,
    /// Segment tagged as a code block.
    CodeBlock,
    /// Segment tagged as a list item.
    ListItem,
    /// Segment tagged as a table.
    Table,
    /// NOTE(review): presumably the fallback for content that fits none of
    /// the other variants — confirm against the extractors.
    Other,
}
/// Qualifier stored in [`ExtractedSegment::location_quality`].
///
/// Variants are (de)serialized under their `snake_case` names, e.g.
/// `PageOnly` becomes `"page_only"`.
///
/// NOTE(review): the per-variant meanings below are inferred from the variant
/// names only; confirm them against the extractor implementations.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LocationQuality {
    /// Presumably: the recorded location is exact.
    Exact,
    /// Presumably: the recorded location is only an approximation.
    Approximate,
    /// Presumably: only page-level location is available.
    PageOnly,
    /// Presumably: no usable location information.
    Unknown,
}
/// One segment of text together with its classification and location data.
///
/// Collected in [`ExtractOutput::segments`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractedSegment {
    /// Classification of this segment.
    pub kind: SegmentKind,
    /// Text content of the segment.
    pub text: String,
    /// First line of the segment.
    ///
    /// NOTE(review): whether line numbers are 0- or 1-based is not visible
    /// from this definition — confirm with the extractors.
    pub line_start: u32,
    /// Last line of the segment.
    ///
    /// NOTE(review): inclusive vs. exclusive is not visible from this
    /// definition — confirm with the extractors.
    pub line_end: u32,
    /// Optional heading path for the segment; `None` when absent.
    ///
    /// NOTE(review): the string format (separator, nesting) is defined by the
    /// extractors, not here.
    pub heading_path: Option<String>,
    /// Quality qualifier for the location fields of this segment.
    pub location_quality: LocationQuality,
}
/// Result value returned by [`DocumentExtractor::extract`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractOutput {
    /// Name of the extractor that produced this output.
    pub extractor_name: String,
    /// Version string of the extractor that produced this output.
    pub extractor_version: String,
    /// Version string of the normalization applied.
    ///
    /// NOTE(review): what "normalization" covers is defined elsewhere.
    pub normalization_version: String,
    /// The extracted segments.
    pub segments: Vec<ExtractedSegment>,
    /// Character count.
    ///
    /// NOTE(review): presumably the total across all segments — confirm how
    /// extractors compute it (chars vs. bytes, pre- or post-normalization).
    pub char_count: u64,
}
/// Interface implemented by document extractors.
///
/// Implementors must be `Send + Sync`.
pub trait DocumentExtractor: Send + Sync {
    /// Returns the extractor's static name.
    fn name(&self) -> &'static str;

    /// Returns the extractor's static version string.
    fn version(&self) -> &'static str;

    /// Returns the file extensions this extractor supports.
    ///
    /// NOTE(review): whether entries include a leading dot or are
    /// case-sensitive is up to the implementors and callers — confirm.
    fn supported_extensions(&self) -> &'static [&'static str];

    /// Extracts the document at `path` into an [`ExtractOutput`].
    ///
    /// # Errors
    ///
    /// Returns the `Err` variant of `OrbokResult` when extraction fails; the
    /// specific failure conditions are determined by each implementation.
    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput>;
}
/// Maps an I/O error onto the matching [`ErrorCategory`].
///
/// * `NotFound` maps to [`ErrorCategory::SourceMissing`]
/// * `PermissionDenied` maps to [`ErrorCategory::PermissionDenied`]
/// * every other kind maps to [`ErrorCategory::ReadError`]
pub fn read_error_category(e: &std::io::Error) -> ErrorCategory {
    use std::io::ErrorKind;

    let kind = e.kind();
    match kind {
        ErrorKind::NotFound => ErrorCategory::SourceMissing,
        ErrorKind::PermissionDenied => ErrorCategory::PermissionDenied,
        // `ErrorKind` is non-exhaustive, so a fallback arm is required.
        _ => ErrorCategory::ReadError,
    }
}