orbok_extract/lib.rs
1//! # orbok-extract
2//!
3//! Text extraction (RFC-005): pluggable extractors turn boundary-
4//! validated source files into normalized, location-tagged segments.
5//! Extraction output is derived data — cacheable, rebuildable, never
6//! authoritative.
7//!
8//! RFC-044 hardening adds resource limits (`ExtractLimits`), structured
9//! warnings (`ExtractWarning`), panic isolation (`extract_safely`), and
10//! explicit location semantics (`LocationKind`). It also removes the
11//! `orbok-db` production dependency: the chunker now produces
12//! `ExtractedChunk`, and the pipeline layer maps it to `ChunkSpec`.
13
14pub mod chunker;
15pub mod normalize;
16pub mod registry;
17pub mod types;
18
19pub mod docx;
20pub mod html;
21mod markdown;
22pub mod pdf;
23pub mod plugin;
24mod text;
25
26#[cfg(test)]
27mod tests;
28
29pub use chunker::chunk;
30pub use plugin::{PluginManifest, PluginRegistry};
31pub use registry::ExtractorRegistry;
32pub use types::{
33 DocumentExtractor, ExtractContext, ExtractLimits, ExtractOutput, ExtractWarning,
34 ExtractedChunk, ExtractedSegment, LocationKind, LocationQuality, SegmentKind,
35};