orbok_extract/types.rs
1//! Extraction types (RFC-005 §6–§8; RFC-044 hardening).
2
3use orbok_core::{ErrorCategory, OrbokResult};
4use orbok_fs::ValidatedPath;
5use serde::{Deserialize, Serialize};
6
7// ── Location semantics ──────────────────────────────────────────────────
8
9/// What the position fields (`line_start` / `line_end`) on a segment
10/// actually mean in the source format (RFC-044 §12).
11///
12/// The UI must use this field before deciding how to label a location —
13/// never assume "line" for all formats.
14#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
15#[serde(rename_all = "snake_case")]
16pub enum LocationKind {
17 /// Position fields are 1-based line numbers.
18 Lines,
19 /// Position fields are 1-based page numbers.
20 Pages,
21 /// Position fields are 1-based paragraph indices.
22 Paragraphs,
23 /// Position fields are approximate block indices.
24 Blocks,
25 /// Position meaning is unknown or not applicable.
26 Unknown,
27}
28
29// ── Segment classification ──────────────────────────────────────────────
30
31/// Segment classification (RFC-005 §8; feeds RFC-006 chunking).
32#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
33#[serde(rename_all = "snake_case")]
34pub enum SegmentKind {
35 Heading,
36 Paragraph,
37 CodeBlock,
38 ListItem,
39 Table,
40 Other,
41}
42
43/// How precise the recorded location is (RFC-006 §8 vocabulary, shared
44/// here because extraction produces the locations).
45#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
46#[serde(rename_all = "snake_case")]
47pub enum LocationQuality {
48 Exact,
49 Approximate,
50 PageOnly,
51 Unknown,
52}
53
54// ── Resource limits ─────────────────────────────────────────────────────
55
/// Upper bounds applied to a single extraction run (RFC-044 §9).
///
/// [`Default`] supplies conservative values intended to keep extraction
/// bounded on any machine. The app layer may configure different values;
/// extractors should read them from here instead of hard-coding their own.
#[derive(Clone, Debug)]
pub struct ExtractLimits {
    /// Largest file size for which the file is read at all.
    pub max_file_bytes: u64,
    /// Cap on the total number of extracted characters, summed over all
    /// segments.
    pub max_extracted_chars: u64,
    /// Cap on the number of segments produced.
    pub max_segments: usize,
    /// Cap on the number of PDF pages processed.
    pub max_pdf_pages: usize,
    /// Cap on the uncompressed size of a single DOCX ZIP entry.
    pub max_docx_xml_bytes: u64,
    /// Cap on the uncompressed size of any ZIP entry.
    pub max_zip_entry_bytes: u64,
    /// Cap on the size of an HTML file.
    pub max_html_bytes: u64,
}

impl Default for ExtractLimits {
    /// Builds the conservative default limits: 64 MiB per file and per ZIP
    /// entry, 32 MiB for DOCX XML and HTML, 5,000,000 extracted characters,
    /// 20,000 segments and 1,000 PDF pages.
    fn default() -> Self {
        // One mebibyte, so the byte limits below read as "N MiB".
        const MIB: u64 = 1024 * 1024;

        ExtractLimits {
            max_file_bytes: 64 * MIB,
            max_extracted_chars: 5_000_000,
            max_segments: 20_000,
            max_pdf_pages: 1_000,
            max_docx_xml_bytes: 32 * MIB,
            max_zip_entry_bytes: 64 * MIB,
            max_html_bytes: 32 * MIB,
        }
    }
}
92
93/// Context passed into every extractor call (RFC-044 §9.3).
94#[derive(Debug, Clone)]
95pub struct ExtractContext {
96 pub limits: ExtractLimits,
97}
98
99impl Default for ExtractContext {
100 fn default() -> Self {
101 Self {
102 limits: ExtractLimits::default(),
103 }
104 }
105}
106
107// ── Structured warnings ─────────────────────────────────────────────────
108
109/// Warnings about partial or degraded extraction (RFC-044 §10).
110///
111/// A non-empty `warnings` list means the output is honest but incomplete.
112/// The UI maps these to plain-language messages; raw variant names must
113/// not appear in default user-facing copy.
114#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
115#[serde(rename_all = "snake_case", tag = "kind")]
116pub enum ExtractWarning {
117 /// Generic content was skipped; `reason` is for logs only.
118 SomeContentSkipped { reason: String },
119 /// These PDF pages could not be read.
120 SomePagesUnreadable { pages: Vec<u32> },
121 /// PDF has pages but no extractable text — likely scanned.
122 PossiblyScannedPdf,
123 /// A resource limit stopped extraction early.
124 SizeLimitReached { limit_name: String },
125 /// File uses an encoding orbok could not read.
126 EncodingUnsupported,
127 /// A document part (e.g. footnotes, embedded object) was skipped.
128 UnsupportedDocumentPart { part: String },
129 /// Location fields are approximate, not exact.
130 ApproximateLocationOnly,
131 /// Malformed content was partially recovered and included.
132 MalformedContentRecovered,
133}
134
135// ── Core segment and output types ───────────────────────────────────────
136
137/// One extracted, normalized segment with source location.
138#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
139pub struct ExtractedSegment {
140 pub kind: SegmentKind,
141 /// Normalized text (norm-v1).
142 pub text: String,
143 /// 1-based inclusive position range; meaning depends on `location_kind`.
144 pub line_start: u32,
145 pub line_end: u32,
146 /// What the position fields represent for this format (RFC-044 §12).
147 pub location_kind: LocationKind,
148 /// Heading trail ("Guide > Install > Linux"), when structure exists.
149 pub heading_path: Option<String>,
150 pub location_quality: LocationQuality,
151}
152
153/// Extraction result for one file (RFC-005 §7; RFC-044 §10.3).
154///
155/// This payload is cached under the `extract-segments:v1` namespace
156/// (Appendix A §7). Adding `warnings` is backward-compatible: existing
157/// cache payloads deserialize with an empty warnings vec via the
158/// `#[serde(default)]` attribute.
159#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
160pub struct ExtractOutput {
161 pub extractor_name: String,
162 pub extractor_version: String,
163 pub normalization_version: String,
164 pub segments: Vec<ExtractedSegment>,
165 pub char_count: u64,
166 /// Structured warnings about partial or degraded extraction.
167 /// Empty means the file was fully and cleanly processed.
168 #[serde(default)]
169 pub warnings: Vec<ExtractWarning>,
170}
171
172// ── Neutral chunk type (RFC-044 §14 Option B) ───────────────────────────
173
174/// A chunk ready for the pipeline, with no dependency on `orbok-db`.
175///
176/// The pipeline layer (`orbok-workers`) maps this to
177/// `orbok_db::repo::ChunkSpec`. This keeps `orbok-extract` free of any
178/// database dependency (RFC-044 §14.6).
179#[derive(Debug, Clone, PartialEq)]
180pub struct ExtractedChunk {
181 pub chunk_kind: &'static str,
182 pub chunk_ordinal: u32,
183 pub heading_path: Option<String>,
184 pub title: Option<String>,
185 pub normalized_text: String,
186 pub location_kind: LocationKind,
187 pub line_start: u32,
188 pub line_end: u32,
189 pub byte_start: Option<u64>,
190 pub byte_end: Option<u64>,
191 pub location_quality: &'static str,
192 pub parent_idx: Option<usize>,
193}
194
195// ── DocumentExtractor trait ─────────────────────────────────────────────
196
197/// A document extractor (RFC-005 §6; RFC-044 §9.3 / §18.4).
198///
199/// Implementations must:
200/// - read only through the [`ValidatedPath`] they are given;
201/// - honour the limits in [`ExtractContext`];
202/// - return typed failure categories, never panic on malformed input;
203/// - populate `location_kind` correctly for their format.
204pub trait DocumentExtractor: Send + Sync {
205 /// Stable name recorded in `extraction_records.extractor_name`.
206 fn name(&self) -> &'static str;
207 /// Version recorded for staleness detection (RFC-005 §9).
208 fn version(&self) -> &'static str;
209 /// Extensions (lowercase, no dot) this extractor handles.
210 fn supported_extensions(&self) -> &'static [&'static str];
211
212 /// Extract and normalize, honoring resource limits.
213 ///
214 /// This is the primary entry point. The default implementation
215 /// delegates to [`extract_legacy`] for backward compatibility during
216 /// the migration period; built-in extractors override this directly.
217 fn extract_with_context(
218 &self,
219 path: &ValidatedPath,
220 context: &ExtractContext,
221 ) -> OrbokResult<ExtractOutput> {
222 // Default: forward to the legacy signature.
223 // Remove once all built-in extractors are migrated.
224 let _ = context; // context used by overrides
225 self.extract(path)
226 }
227
228 /// Legacy entry point (no limits). Kept for the migration period;
229 /// callers should prefer [`extract_with_context`].
230 fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput>;
231}
232
233// ── Helper ──────────────────────────────────────────────────────────────
234
235/// Classify a read failure (RFC-005 §13).
236pub fn read_error_category(e: &std::io::Error) -> ErrorCategory {
237 match e.kind() {
238 std::io::ErrorKind::PermissionDenied => ErrorCategory::PermissionDenied,
239 std::io::ErrorKind::NotFound => ErrorCategory::SourceMissing,
240 _ => ErrorCategory::ReadError,
241 }
242}