1use orbok_core::{ErrorCategory, OrbokResult};
4use orbok_fs::ValidatedPath;
5use serde::{Deserialize, Serialize};
6
/// Structural kind of an [`ExtractedSegment`].
///
/// Serialized by serde using `snake_case` variant names (for example
/// `CodeBlock` becomes `"code_block"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SegmentKind {
    /// A heading.
    Heading,
    /// A paragraph.
    Paragraph,
    /// A code block.
    CodeBlock,
    /// A list item.
    ListItem,
    /// A table.
    Table,
    /// NOTE(review): presumably the fallback for content that fits none of
    /// the other kinds; confirm against the extractor implementations.
    Other,
}
18
/// Quality marker attached to an [`ExtractedSegment`] via its
/// `location_quality` field.
///
/// Serialized by serde using `snake_case` variant names (for example
/// `PageOnly` becomes `"page_only"`).
///
/// NOTE(review): presumably describes how far `line_start` / `line_end` of a
/// segment can be trusted; the exact meaning of each variant is decided by
/// the extractors and is not visible in this file, so confirm before relying
/// on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LocationQuality {
    /// Presumably: the reported location is exact.
    Exact,
    /// Presumably: the reported location is an estimate.
    Approximate,
    /// Presumably: only page-level location is known.
    PageOnly,
    /// Presumably: no usable location information.
    Unknown,
}
29
/// One segment of text produced by a [`DocumentExtractor`], together with
/// its kind and location metadata.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractedSegment {
    /// Structural kind of this segment.
    pub kind: SegmentKind,
    /// Text content of the segment.
    pub text: String,
    /// First line of the segment in the source document.
    /// NOTE(review): whether this is 0- or 1-based is not shown here; confirm.
    pub line_start: u32,
    /// Last line of the segment in the source document.
    /// NOTE(review): whether this bound is inclusive is not shown here; confirm.
    pub line_end: u32,
    /// Optional heading context for the segment; `None` when absent.
    /// NOTE(review): the path format (separator, nesting) is not shown here.
    pub heading_path: Option<String>,
    /// Quality marker for the location fields above.
    pub location_quality: LocationQuality,
}
43
/// Result of a successful [`DocumentExtractor::extract`] call: the extracted
/// segments plus identifying metadata about how they were produced.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractOutput {
    /// Name of the extractor that produced this output.
    /// NOTE(review): presumably mirrors [`DocumentExtractor::name`]; confirm.
    pub extractor_name: String,
    /// Version of the extractor that produced this output.
    /// NOTE(review): presumably mirrors [`DocumentExtractor::version`]; confirm.
    pub extractor_version: String,
    /// Version identifier of the text normalization that was applied.
    /// NOTE(review): the normalization itself is defined outside this file.
    pub normalization_version: String,
    /// Segments extracted from the document.
    pub segments: Vec<ExtractedSegment>,
    /// Character count reported for the extraction.
    /// NOTE(review): presumably the total over all segment texts; whether it
    /// counts bytes, `char`s or graphemes is not shown here, so confirm.
    pub char_count: u64,
}
54
/// Interface implemented by every document extractor.
///
/// The `Send + Sync` supertraits mean implementations must be safe to move
/// to and share between threads.
pub trait DocumentExtractor: Send + Sync {
    /// Returns the extractor's name as a static string.
    fn name(&self) -> &'static str;
    /// Returns the extractor's version as a static string.
    fn version(&self) -> &'static str;
    /// Returns the file extensions this extractor supports.
    /// NOTE(review): whether entries include a leading dot or are
    /// case-sensitive is not shown here; confirm against implementations.
    fn supported_extensions(&self) -> &'static [&'static str];
    /// Extracts the document at `path`.
    ///
    /// # Errors
    ///
    /// Returns the `Err` variant of [`OrbokResult`] when extraction fails;
    /// the concrete failure conditions are implementation-specific.
    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput>;
}
69
70pub fn read_error_category(e: &std::io::Error) -> ErrorCategory {
72 match e.kind() {
73 std::io::ErrorKind::PermissionDenied => ErrorCategory::PermissionDenied,
74 std::io::ErrorKind::NotFound => ErrorCategory::SourceMissing,
75 _ => ErrorCategory::ReadError,
76 }
77}