Skip to main content

orbok_extract/
types.rs

1//! Extraction types (RFC-005 §6–§8; RFC-044 hardening).
2
3use orbok_core::{ErrorCategory, OrbokResult};
4use orbok_fs::ValidatedPath;
5use serde::{Deserialize, Serialize};
6
7// ── Location semantics ──────────────────────────────────────────────────
8
/// What the position fields (`line_start` / `line_end`) on a segment
/// actually mean in the source format (RFC-044 §12).
///
/// The field names say "line" for every format, but the unit they count
/// depends on this value. The UI must use this field before deciding how
/// to label a location — never assume "line" for all formats.
///
/// Variants are serialized in `snake_case` (`lines`, `pages`, …).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LocationKind {
    /// Position fields are 1-based line numbers.
    Lines,
    /// Position fields are 1-based page numbers.
    Pages,
    /// Position fields are 1-based paragraph indices.
    Paragraphs,
    /// Position fields are approximate block indices.
    Blocks,
    /// Position meaning is unknown or not applicable.
    Unknown,
}
28
29// ── Segment classification ──────────────────────────────────────────────
30
/// Segment classification (RFC-005 §8; feeds RFC-006 chunking).
///
/// Variants are serialized in `snake_case` (`heading`, `code_block`, …).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SegmentKind {
    /// A heading.
    Heading,
    /// A paragraph.
    Paragraph,
    /// A code block.
    CodeBlock,
    /// A single list item.
    ListItem,
    /// A table.
    Table,
    /// A segment that is none of the kinds above.
    Other,
}
42
/// How precise the recorded location is (RFC-006 §8 vocabulary, shared
/// here because extraction produces the locations).
///
/// Variants are serialized in `snake_case` (`exact`, `page_only`, …).
// NOTE(review): the per-variant meanings below are read off the variant
// names; confirm them against RFC-006 §8.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LocationQuality {
    /// The recorded position is exact.
    Exact,
    /// The recorded position is approximate.
    Approximate,
    /// Presumably only the page is known, not a finer position — confirm.
    PageOnly,
    /// The precision of the recorded position is unknown.
    Unknown,
}
53
54// ── Resource limits ─────────────────────────────────────────────────────
55
/// Resource ceilings applied to a single extraction (RFC-044 §9).
///
/// The [`Default`] values are deliberately conservative so that extraction
/// stays bounded on any machine. The app layer may configure different
/// values; extractors must read them from here instead of hard-coding
/// their own.
#[derive(Debug, Clone)]
pub struct ExtractLimits {
    /// Largest file, in bytes, that will be read at all.
    pub max_file_bytes: u64,
    /// Cap on the total number of extracted characters across all segments.
    pub max_extracted_chars: u64,
    /// Cap on the number of segments produced.
    pub max_segments: usize,
    /// Cap on the number of PDF pages processed.
    pub max_pdf_pages: usize,
    /// Largest uncompressed size, in bytes, of a single DOCX ZIP entry.
    pub max_docx_xml_bytes: u64,
    /// Largest uncompressed size, in bytes, of any ZIP entry.
    pub max_zip_entry_bytes: u64,
    /// Largest HTML file size, in bytes.
    pub max_html_bytes: u64,
}

impl Default for ExtractLimits {
    /// Returns the conservative baseline: 64 MiB for files and ZIP entries,
    /// 32 MiB for DOCX XML entries and HTML files, 5 000 000 extracted
    /// characters, 20 000 segments and 1 000 PDF pages.
    fn default() -> Self {
        /// Number of bytes in one mebibyte.
        const MIB: u64 = 1024 * 1024;

        Self {
            // Byte-size ceilings.
            max_file_bytes: 64 * MIB,
            max_docx_xml_bytes: 32 * MIB,
            max_zip_entry_bytes: 64 * MIB,
            max_html_bytes: 32 * MIB,
            // Count ceilings.
            max_extracted_chars: 5_000_000,
            max_segments: 20_000,
            max_pdf_pages: 1_000,
        }
    }
}
92
93/// Context passed into every extractor call (RFC-044 §9.3).
94#[derive(Debug, Clone)]
95pub struct ExtractContext {
96    pub limits: ExtractLimits,
97}
98
99impl Default for ExtractContext {
100    fn default() -> Self {
101        Self {
102            limits: ExtractLimits::default(),
103        }
104    }
105}
106
107// ── Structured warnings ─────────────────────────────────────────────────
108
/// Warnings about partial or degraded extraction (RFC-044 §10).
///
/// A non-empty `warnings` list means the output is honest but incomplete.
/// The UI maps these to plain-language messages; raw variant names must
/// not appear in default user-facing copy.
///
/// Serialized as an internally tagged enum: the variant name goes, in
/// `snake_case`, into a `kind` field next to the variant's own fields —
/// in JSON, for example, `{"kind": "some_pages_unreadable", "pages": [3]}`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case", tag = "kind")]
pub enum ExtractWarning {
    /// Generic content was skipped; `reason` is for logs only.
    SomeContentSkipped { reason: String },
    /// These PDF pages could not be read.
    SomePagesUnreadable { pages: Vec<u32> },
    /// PDF has pages but no extractable text — likely scanned.
    PossiblyScannedPdf,
    /// A resource limit stopped extraction early; `limit_name` identifies
    /// which one.
    // NOTE(review): presumably `limit_name` matches an `ExtractLimits`
    // field name — confirm against the extractors that emit it.
    SizeLimitReached { limit_name: String },
    /// File uses an encoding orbok could not read.
    EncodingUnsupported,
    /// A document part (e.g. footnotes, embedded object) was skipped;
    /// `part` names the skipped part.
    UnsupportedDocumentPart { part: String },
    /// Location fields are approximate, not exact.
    ApproximateLocationOnly,
    /// Malformed content was partially recovered and included.
    MalformedContentRecovered,
}
134
135// ── Core segment and output types ───────────────────────────────────────
136
/// One extracted, normalized segment with source location.
///
/// `line_start` and `line_end` are named after lines, but their unit is
/// given by `location_kind`; read that field before interpreting them.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractedSegment {
    /// Classification of this segment.
    pub kind: SegmentKind,
    /// Normalized text (norm-v1).
    pub text: String,
    /// Start of the 1-based inclusive position range; meaning depends on
    /// `location_kind`.
    pub line_start: u32,
    /// End of the 1-based inclusive position range; same unit as
    /// `line_start`.
    pub line_end: u32,
    /// What the position fields represent for this format (RFC-044 §12).
    pub location_kind: LocationKind,
    /// Heading trail ("Guide > Install > Linux"), when structure exists.
    pub heading_path: Option<String>,
    /// How precise the recorded position range is.
    pub location_quality: LocationQuality,
}
152
/// Extraction result for one file (RFC-005 §7; RFC-044 §10.3).
///
/// This payload is cached under the `extract-segments:v1` namespace
/// (Appendix A §7). Adding `warnings` is backward-compatible: existing
/// cache payloads deserialize with an empty warnings vec via the
/// `#[serde(default)]` attribute.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractOutput {
    /// Name of the extractor that produced this output.
    // NOTE(review): presumably the value of `DocumentExtractor::name` — confirm.
    pub extractor_name: String,
    /// Version of the extractor that produced this output.
    // NOTE(review): presumably the value of `DocumentExtractor::version` — confirm.
    pub extractor_version: String,
    /// Identifier of the text normalization applied to the segments.
    // NOTE(review): segment text is documented as "norm-v1"; confirm that
    // this field carries that identifier.
    pub normalization_version: String,
    /// The extracted segments.
    pub segments: Vec<ExtractedSegment>,
    /// Character count for this output.
    // NOTE(review): presumably the total over all `segments` — confirm how
    // extractors compute it.
    pub char_count: u64,
    /// Structured warnings about partial or degraded extraction.
    /// Empty means the file was fully and cleanly processed.
    /// Defaults to an empty vec when the field is absent on deserialization.
    #[serde(default)]
    pub warnings: Vec<ExtractWarning>,
}
171
172// ── Neutral chunk type (RFC-044 §14 Option B) ───────────────────────────
173
/// A chunk ready for the pipeline, with no dependency on `orbok-db`.
///
/// The pipeline layer (`orbok-workers`) maps this to
/// `orbok_db::repo::ChunkSpec`. This keeps `orbok-extract` free of any
/// database dependency (RFC-044 §14.6).
///
/// Unlike the segment types in this file, this struct does not derive
/// `Serialize` / `Deserialize`.
// NOTE(review): `chunk_kind` and `location_quality` are plain
// `&'static str` here, whereas segments use the `SegmentKind` and
// `LocationQuality` enums. The accepted string values are not visible in
// this file — confirm them against the `ChunkSpec` mapping.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedChunk {
    /// Chunk classification, as a static string.
    pub chunk_kind: &'static str,
    /// Ordinal of this chunk.
    // NOTE(review): presumably its position in the file's chunk sequence;
    // whether it is 0- or 1-based is not shown here — confirm.
    pub chunk_ordinal: u32,
    /// Heading trail for this chunk, when available.
    pub heading_path: Option<String>,
    /// Title for this chunk, when available.
    pub title: Option<String>,
    /// Normalized chunk text.
    pub normalized_text: String,
    /// What `line_start` / `line_end` represent for this chunk.
    pub location_kind: LocationKind,
    /// Start of the position range; unit given by `location_kind`.
    pub line_start: u32,
    /// End of the position range; unit given by `location_kind`.
    pub line_end: u32,
    /// Starting byte offset, when known.
    // NOTE(review): offset base and inclusivity are not specified here.
    pub byte_start: Option<u64>,
    /// Ending byte offset, when known.
    pub byte_end: Option<u64>,
    /// Location precision, as a static string.
    pub location_quality: &'static str,
    /// Index of the parent chunk, if any.
    // NOTE(review): presumably an index into the same list of chunks —
    // confirm against the producer.
    pub parent_idx: Option<usize>,
}
194
195// ── DocumentExtractor trait ─────────────────────────────────────────────
196
/// A document extractor (RFC-005 §6; RFC-044 §9.3 / §18.4).
///
/// Implementations must:
/// - read only through the [`ValidatedPath`] they are given;
/// - honour the limits in [`ExtractContext`];
/// - return typed failure categories, never panic on malformed input;
/// - populate `location_kind` correctly for their format.
///
/// Every implementor must also be `Send + Sync`.
pub trait DocumentExtractor: Send + Sync {
    /// Stable name recorded in `extraction_records.extractor_name`.
    fn name(&self) -> &'static str;
    /// Version recorded for staleness detection (RFC-005 §9).
    fn version(&self) -> &'static str;
    /// Extensions (lowercase, no dot) this extractor handles.
    fn supported_extensions(&self) -> &'static [&'static str];

    /// Extract and normalize, honoring resource limits.
    ///
    /// This is the primary entry point. The default implementation
    /// ignores `context` and delegates to the legacy [`Self::extract`]
    /// for backward compatibility during the migration period, so limits
    /// are enforced only by extractors that override this method;
    /// built-in extractors override this directly.
    fn extract_with_context(
        &self,
        path: &ValidatedPath,
        context: &ExtractContext,
    ) -> OrbokResult<ExtractOutput> {
        // Default: forward to the legacy signature.
        // Remove once all built-in extractors are migrated.
        let _ = context; // deliberately unused here; overrides consume it
        self.extract(path)
    }

    /// Legacy entry point (no limits). Kept for the migration period;
    /// callers should prefer [`Self::extract_with_context`].
    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput>;
}
232
233// ── Helper ──────────────────────────────────────────────────────────────
234
235/// Classify a read failure (RFC-005 §13).
236pub fn read_error_category(e: &std::io::Error) -> ErrorCategory {
237    match e.kind() {
238        std::io::ErrorKind::PermissionDenied => ErrorCategory::PermissionDenied,
239        std::io::ErrorKind::NotFound => ErrorCategory::SourceMissing,
240        _ => ErrorCategory::ReadError,
241    }
242}