Skip to main content

orbok_extract/
types.rs

1//! Extraction types (RFC-005 §6–§8).
2
3use orbok_core::{ErrorCategory, OrbokResult};
4use orbok_fs::ValidatedPath;
5use serde::{Deserialize, Serialize};
6
/// Segment classification (RFC-005 §8; feeds RFC-006 chunking).
///
/// The type is `Copy` and serializes through serde with `snake_case`
/// variant names, so renaming a variant changes the serialized form.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SegmentKind {
    /// Serialized as `heading`.
    Heading,
    /// Serialized as `paragraph`.
    Paragraph,
    /// Serialized as `code_block`.
    CodeBlock,
    /// Serialized as `list_item`.
    ListItem,
    /// Serialized as `table`.
    Table,
    /// Serialized as `other`.
    ///
    /// NOTE(review): presumably the fallback for content that matches none
    /// of the kinds above — confirm against RFC-005 §8.
    Other,
}
18
/// How precise the recorded location is (RFC-006 §8 vocabulary, shared
/// here because extraction produces the locations).
///
/// The type is `Copy` and serializes through serde with `snake_case`
/// variant names, so renaming a variant changes the serialized form.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LocationQuality {
    /// Serialized as `exact`.
    Exact,
    /// Serialized as `approximate`.
    Approximate,
    /// Serialized as `page_only`.
    ///
    /// NOTE(review): presumably means only a page, not a line range, is
    /// known — confirm against RFC-006 §8.
    PageOnly,
    /// Serialized as `unknown`.
    Unknown,
}
29
/// One extracted, normalized segment with source location.
///
/// Equality (`PartialEq`) is derived, so two segments are equal only when
/// every field matches.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractedSegment {
    /// Classification of this segment.
    pub kind: SegmentKind,
    /// Normalized text (norm-v1).
    pub text: String,
    /// First line of the segment: start of the 1-based inclusive line
    /// range in the source file.
    pub line_start: u32,
    /// Last line of the segment: end of the same 1-based inclusive line
    /// range that `line_start` opens.
    pub line_end: u32,
    /// Heading trail ("Guide > Install > Linux"), when structure exists.
    pub heading_path: Option<String>,
    /// How precise the recorded location of this segment is.
    pub location_quality: LocationQuality,
}
43
/// Extraction result for one file (RFC-005 §7). This payload is cached
/// under the `extract-segments:v1` namespace (Appendix A §7).
///
/// Because the serialized form is what gets cached, field names and types
/// here are part of the cache format.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractOutput {
    /// Name of the extractor that produced this output.
    ///
    /// NOTE(review): presumably the value of [`DocumentExtractor::name`] —
    /// confirm at the construction sites.
    pub extractor_name: String,
    /// Version of the extractor that produced this output.
    ///
    /// NOTE(review): presumably the value of [`DocumentExtractor::version`]
    /// — confirm at the construction sites.
    pub extractor_version: String,
    /// Identifier of the text normalization applied to the segments.
    ///
    /// NOTE(review): presumably a tag such as `norm-v1` (the scheme named
    /// on [`ExtractedSegment::text`]) — confirm the exact format.
    pub normalization_version: String,
    /// The extracted segments for the file.
    pub segments: Vec<ExtractedSegment>,
    /// Character count for the extraction.
    ///
    /// NOTE(review): whether this counts normalized segment text or raw
    /// source characters is not visible here — confirm against RFC-005 §7.
    pub char_count: u64,
}
54
/// A document extractor (RFC-005 §6). Implementations must:
/// - read only through the [`ValidatedPath`] they are given;
/// - stream or bound memory (NFR-023);
/// - return typed failure categories, never panic on malformed input.
///
/// The `Send + Sync` supertraits require every implementation to be safe
/// to move to, and share between, threads.
pub trait DocumentExtractor: Send + Sync {
    /// Stable name recorded in `extraction_records.extractor_name`.
    fn name(&self) -> &'static str;
    /// Version recorded for staleness detection (RFC-005 §9).
    fn version(&self) -> &'static str;
    /// Extensions (lowercase, no dot) this extractor handles.
    fn supported_extensions(&self) -> &'static [&'static str];
    /// Extract and normalize.
    ///
    /// Reads the document at `path` and returns its [`ExtractOutput`].
    ///
    /// # Errors
    ///
    /// Per the trait contract above, failures are reported through the
    /// returned [`OrbokResult`] as typed categories rather than by
    /// panicking.
    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput>;
}
69
70/// Helper for extractors: classify a read failure (RFC-005 §13).
71pub fn read_error_category(e: &std::io::Error) -> ErrorCategory {
72    match e.kind() {
73        std::io::ErrorKind::PermissionDenied => ErrorCategory::PermissionDenied,
74        std::io::ErrorKind::NotFound => ErrorCategory::SourceMissing,
75        _ => ErrorCategory::ReadError,
76    }
77}