use orbok_core::{ErrorCategory, OrbokResult};
use orbok_fs::ValidatedPath;
use serde::{Deserialize, Serialize};
/// Unit in which a position inside a document is expressed.
///
/// Serialized by serde as the snake_case variant name (`"lines"`,
/// `"pages"`, ...). The per-variant notes describe what each name suggests;
/// the producers that pick a variant are not part of this definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LocationKind {
    /// Positions counted in lines.
    Lines,
    /// Positions counted in pages.
    Pages,
    /// Positions counted in paragraphs.
    Paragraphs,
    /// Positions counted in blocks.
    Blocks,
    /// No unit could be determined.
    Unknown,
}
/// Structural role of an extracted piece of text.
///
/// Serialized by serde as the snake_case variant name, e.g. `CodeBlock`
/// becomes `"code_block"` and `ListItem` becomes `"list_item"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SegmentKind {
    /// A heading or title line.
    Heading,
    /// A paragraph of running text.
    Paragraph,
    /// A block of source code or preformatted text.
    CodeBlock,
    /// A single entry of a list.
    ListItem,
    /// Tabular content.
    Table,
    /// Anything not covered by the other variants.
    Other,
}
/// How much a reported location can be trusted.
///
/// Serialized by serde as the snake_case variant name, e.g. `PageOnly`
/// becomes `"page_only"`. The per-variant notes follow the variant names;
/// the exact criteria are decided by the extractors, not here.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LocationQuality {
    /// The location is precise.
    Exact,
    /// The location is a best-effort estimate.
    Approximate,
    /// Only the page is known.
    PageOnly,
    /// No usable location information.
    Unknown,
}
/// Size and count ceilings made available to extractors.
///
/// This type only carries the numbers; nothing in it enforces them. The
/// field notes describe the intent suggested by each field name.
#[derive(Debug, Clone)]
pub struct ExtractLimits {
    /// Ceiling on the size of the input file, in bytes.
    pub max_file_bytes: u64,
    /// Ceiling on the number of characters of extracted text.
    pub max_extracted_chars: u64,
    /// Ceiling on the number of segments produced for one document.
    pub max_segments: usize,
    /// Ceiling on the number of PDF pages processed.
    pub max_pdf_pages: usize,
    /// Ceiling on the size of DOCX XML content, in bytes.
    pub max_docx_xml_bytes: u64,
    /// Ceiling on the size of a single ZIP entry, in bytes.
    pub max_zip_entry_bytes: u64,
    /// Ceiling on the size of HTML input, in bytes.
    pub max_html_bytes: u64,
}

impl Default for ExtractLimits {
    /// Returns the stock limits: 64 MiB files, 5,000,000 extracted
    /// characters, 20,000 segments, 1,000 PDF pages, 32 MiB of DOCX XML,
    /// 64 MiB per ZIP entry and 32 MiB of HTML.
    fn default() -> Self {
        // One mebibyte; keeps the byte-sized limits readable.
        const MIB: u64 = 1024 * 1024;

        Self {
            max_file_bytes: 64 * MIB,
            max_extracted_chars: 5_000_000,
            max_segments: 20_000,
            max_pdf_pages: 1_000,
            max_docx_xml_bytes: 32 * MIB,
            max_zip_entry_bytes: 64 * MIB,
            max_html_bytes: 32 * MIB,
        }
    }
}
#[derive(Debug, Clone)]
pub struct ExtractContext {
pub limits: ExtractLimits,
}
impl Default for ExtractContext {
fn default() -> Self {
Self {
limits: ExtractLimits::default(),
}
}
}
/// Non-fatal condition reported alongside an extraction result.
///
/// Serialized by serde in internally tagged form: an object whose `kind`
/// field holds the snake_case variant name (for example
/// `"some_content_skipped"`), followed by the variant's own fields.
/// The per-variant notes follow the variant names; when each one is raised
/// is decided by the extractors.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case", tag = "kind")]
pub enum ExtractWarning {
    /// Part of the content was left out; `reason` says why.
    SomeContentSkipped { reason: String },
    /// The listed pages could not be read.
    // NOTE(review): whether page numbers are 0- or 1-based is not visible here.
    SomePagesUnreadable { pages: Vec<u32> },
    /// The PDF may consist of scanned images rather than text.
    PossiblyScannedPdf,
    /// A limit was hit; `limit_name` identifies which one.
    SizeLimitReached { limit_name: String },
    /// The text encoding is not supported.
    EncodingUnsupported,
    /// A part of the document is not supported; `part` names it.
    UnsupportedDocumentPart { part: String },
    /// Only approximate locations are available.
    ApproximateLocationOnly,
    /// Malformed content was encountered and recovered from.
    MalformedContentRecovered,
}
/// One piece of text pulled out of a document, with its kind and location.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractedSegment {
    /// Structural role of this segment.
    pub kind: SegmentKind,
    /// The segment's text.
    pub text: String,
    /// Start of the segment's location range.
    // NOTE(review): despite the `line_` prefix, the unit presumably follows
    // `location_kind`; base and inclusiveness are not visible here — confirm.
    pub line_start: u32,
    /// End of the segment's location range (same caveats as `line_start`).
    pub line_end: u32,
    /// Unit associated with the location fields.
    pub location_kind: LocationKind,
    /// Heading context of the segment, when one is known.
    pub heading_path: Option<String>,
    /// How trustworthy the location is.
    pub location_quality: LocationQuality,
}
/// Complete result of running one extractor over one document.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractOutput {
    /// Name of the extractor that produced this output.
    pub extractor_name: String,
    /// Version of that extractor.
    pub extractor_version: String,
    /// Version of the text normalization that was applied.
    pub normalization_version: String,
    /// The extracted segments.
    pub segments: Vec<ExtractedSegment>,
    /// Character count reported for the extraction.
    // NOTE(review): presumably the total over `segments`; confirm with producers.
    pub char_count: u64,
    /// Non-fatal problems met during extraction. When the field is absent
    /// from serialized input it deserializes to an empty list.
    #[serde(default)]
    pub warnings: Vec<ExtractWarning>,
}
/// A chunk of normalized text together with its position metadata.
///
/// Unlike the segment and output types in this file, this struct has no
/// serde derives.
// NOTE(review): `chunk_kind` and `location_quality` are plain static strings
// even though `SegmentKind` and `LocationQuality` enums exist in this file —
// presumably they hold the matching snake_case names; confirm with producers.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedChunk {
    /// Kind of the chunk, as a static string.
    pub chunk_kind: &'static str,
    /// Ordinal of the chunk.
    pub chunk_ordinal: u32,
    /// Heading context of the chunk, when one is known.
    pub heading_path: Option<String>,
    /// Title of the chunk, when it has one.
    pub title: Option<String>,
    /// The chunk's normalized text.
    pub normalized_text: String,
    /// Unit associated with the location fields.
    pub location_kind: LocationKind,
    /// Start of the chunk's location range.
    pub line_start: u32,
    /// End of the chunk's location range.
    pub line_end: u32,
    /// Starting byte offset, when known.
    pub byte_start: Option<u64>,
    /// Ending byte offset, when known.
    pub byte_end: Option<u64>,
    /// Location quality, as a static string.
    pub location_quality: &'static str,
    /// Index of the parent chunk, if any.
    // NOTE(review): presumably an index into the same chunk list — confirm.
    pub parent_idx: Option<usize>,
}
/// Interface implemented by every document extractor.
///
/// Implementors must be `Send + Sync`.
pub trait DocumentExtractor: Send + Sync {
    /// Returns the extractor's name.
    fn name(&self) -> &'static str;

    /// Returns the extractor's version string.
    fn version(&self) -> &'static str;

    /// Returns the file extensions this extractor handles.
    // NOTE(review): whether entries include a leading dot is not visible here.
    fn supported_extensions(&self) -> &'static [&'static str];

    /// Extracts the document at `path`.
    ///
    /// # Errors
    ///
    /// Returns whatever error the implementation reports.
    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput>;

    /// Extracts the document at `path` with access to `context`.
    ///
    /// The provided implementation ignores `context` and forwards to
    /// [`DocumentExtractor::extract`]; override it to make use of the
    /// context.
    ///
    /// # Errors
    ///
    /// The provided implementation returns whatever `extract` returns.
    fn extract_with_context(
        &self,
        path: &ValidatedPath,
        context: &ExtractContext,
    ) -> OrbokResult<ExtractOutput> {
        // Deliberately unused by the default implementation.
        let _ = context;
        self.extract(path)
    }
}
/// Maps an I/O error to the [`ErrorCategory`] used for reporting.
///
/// `PermissionDenied` maps to `ErrorCategory::PermissionDenied`, `NotFound`
/// maps to `ErrorCategory::SourceMissing`, and every other kind maps to
/// `ErrorCategory::ReadError`.
pub fn read_error_category(e: &std::io::Error) -> ErrorCategory {
    use std::io::ErrorKind;

    let kind = e.kind();
    match kind {
        ErrorKind::NotFound => ErrorCategory::SourceMissing,
        ErrorKind::PermissionDenied => ErrorCategory::PermissionDenied,
        // `ErrorKind` is non-exhaustive, so a catch-all arm is required.
        _ => ErrorCategory::ReadError,
    }
}