use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::collections::HashMap;
use super::djot::DjotContent;
use super::document_structure::DocumentStructure;
use super::metadata::Metadata;
use super::ocr_elements::OcrElement;
use super::page::PageContent;
use super::tables::Table;
/// Aggregate output produced for a single extracted document.
///
/// The type is serde-serializable. Optional sections are omitted from the
/// serialized form when they are `None` (and `processing_warnings` when it is
/// empty), so the serialized output only carries the sections that are present.
///
/// The type is recursive: `children` holds `ArchiveEntry` values and
/// `ExtractedImage::ocr_result` holds a boxed `ExtractionResult`. That is
/// presumably why the OpenAPI schema is marked `no_recursion` when the `api`
/// feature is enabled.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
#[cfg_attr(feature = "api", schema(no_recursion))]
pub struct ExtractionResult {
/// Extracted text content.
pub content: String,
/// MIME type string for this result; described as a plain `String` in the
/// OpenAPI schema.
///
/// NOTE(review): presumably the detected MIME type of the input — confirm.
#[cfg_attr(feature = "api", schema(value_type = String))]
pub mime_type: Cow<'static, str>,
/// Metadata attached to the result (see `Metadata`).
pub metadata: Metadata,
/// Tables attached to the result. Always serialized, even when empty.
pub tables: Vec<Table>,
/// Detected languages, if any. Omitted from serialized output when `None`.
///
/// NOTE(review): the identifier format (e.g. ISO 639 codes) is not visible
/// here — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub detected_languages: Option<Vec<String>>,
/// Chunks of the content (see `Chunk`), if any. Omitted from serialized
/// output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub chunks: Option<Vec<Chunk>>,
/// Extracted images (see `ExtractedImage`), if any. Omitted from serialized
/// output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub images: Option<Vec<ExtractedImage>>,
/// Per-page content (see `PageContent`), if any. Omitted from serialized
/// output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub pages: Option<Vec<PageContent>>,
/// Element-level view of the document (see `Element`), if any. Omitted from
/// serialized output when `None`.
///
/// NOTE(review): presumably populated for `OutputFormat::ElementBased` —
/// confirm.
#[serde(skip_serializing_if = "Option::is_none", default)]
pub elements: Option<Vec<Element>>,
/// Djot representation of the content (see `DjotContent`), if any. Omitted
/// from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub djot_content: Option<DjotContent>,
/// OCR elements (see `OcrElement`), if any. Omitted from serialized output
/// when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub ocr_elements: Option<Vec<OcrElement>>,
/// Structured document representation (see `DocumentStructure`), if any.
/// Omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub document: Option<DocumentStructure>,
/// Extracted keywords, if any. Omitted from serialized output when `None`.
///
/// This field only exists when the `keywords-yake` or `keywords-rake`
/// feature is enabled.
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
pub extracted_keywords: Option<Vec<crate::keywords::Keyword>>,
/// Quality score, if one was computed. Omitted from serialized output when
/// `None`.
///
/// NOTE(review): the scale/range of the score is not visible here — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub quality_score: Option<f64>,
/// Warnings attached to the result. Omitted from serialized output when
/// empty, and defaults to an empty list when absent from the input.
#[serde(skip_serializing_if = "Vec::is_empty")]
#[serde(default)]
pub processing_warnings: Vec<ProcessingWarning>,
/// PDF annotations, if any. Omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub annotations: Option<Vec<super::annotations::PdfAnnotation>>,
/// Nested results, one `ArchiveEntry` each, if any. Omitted from serialized
/// output when `None`.
///
/// NOTE(review): presumably only set for archive inputs — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub children: Option<Vec<ArchiveEntry>>,
}
/// A nested extraction result paired with the path and MIME type of the entry
/// it came from. Used as the element type of `ExtractionResult::children`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ArchiveEntry {
/// Path of the entry.
///
/// NOTE(review): presumably relative to the archive root — confirm.
pub path: String,
/// MIME type string of the entry.
pub mime_type: String,
/// Boxed extraction result for this entry.
pub result: Box<ExtractionResult>,
}
/// A warning carried in `ExtractionResult::processing_warnings`.
///
/// Both fields are `Cow<'static, str>`, so they can hold either a static
/// string literal or an owned `String`; both are described as plain `String`
/// in the OpenAPI schema.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ProcessingWarning {
/// What produced the warning.
///
/// NOTE(review): the naming convention for sources is not visible here —
/// confirm.
#[cfg_attr(feature = "api", schema(value_type = String))]
pub source: Cow<'static, str>,
/// Warning text.
#[cfg_attr(feature = "api", schema(value_type = String))]
pub message: Cow<'static, str>,
}
/// One chunk of content together with its positional metadata and an optional
/// embedding vector.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct Chunk {
/// Text of the chunk.
pub content: String,
/// Embedding vector for the chunk, if one is attached. Omitted from
/// serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub embedding: Option<Vec<f32>>,
/// Position and size information for the chunk (see `ChunkMetadata`).
pub metadata: ChunkMetadata,
}
/// Heading information that can be attached to a chunk through
/// `ChunkMetadata::heading_context`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct HeadingContext {
/// Headings that make up the context.
///
/// NOTE(review): presumably ordered from outermost to innermost heading —
/// confirm.
pub headings: Vec<HeadingLevel>,
}
/// A single heading: its level and its text.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct HeadingLevel {
/// Heading level.
///
/// NOTE(review): presumably 1 is the top level (as in `h1`) — confirm.
pub level: u8,
/// Heading text.
pub text: String,
}
/// Positional and size metadata for a `Chunk`.
///
/// NOTE(review): indexing conventions (0- vs 1-based indices and pages,
/// inclusive vs exclusive `byte_end`) are not visible here — confirm against
/// the code that fills these in.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ChunkMetadata {
/// Byte offset at which the chunk starts.
pub byte_start: usize,
/// Byte offset at which the chunk ends.
pub byte_end: usize,
/// Token count of the chunk, if known. Omitted from serialized output when
/// `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub token_count: Option<usize>,
/// Index of this chunk among its siblings.
pub chunk_index: usize,
/// Total number of chunks.
pub total_chunks: usize,
/// First page covered by the chunk, if known. Omitted from serialized
/// output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub first_page: Option<usize>,
/// Last page covered by the chunk, if known. Omitted from serialized output
/// when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub last_page: Option<usize>,
/// Heading context for the chunk (see `HeadingContext`), if any. Omitted
/// from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub heading_context: Option<HeadingContext>,
}
/// An image extracted from a document, with its raw bytes and descriptive
/// attributes.
///
/// Optional attributes are omitted from the serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ExtractedImage {
/// Raw image bytes; described as binary `Vec<u8>` in the OpenAPI schema.
#[cfg_attr(feature = "api", schema(value_type = Vec<u8>, format = "binary"))]
pub data: Bytes,
/// Image format string; described as a plain `String` in the OpenAPI
/// schema.
///
/// NOTE(review): the set of format names in use is not visible here —
/// confirm.
#[cfg_attr(feature = "api", schema(value_type = String))]
pub format: Cow<'static, str>,
/// Index of the image.
///
/// NOTE(review): presumably 0-based and document-wide — confirm.
pub image_index: usize,
/// Page the image was found on, if known.
#[serde(skip_serializing_if = "Option::is_none")]
pub page_number: Option<usize>,
/// Image width, if known (presumably in pixels — confirm).
#[serde(skip_serializing_if = "Option::is_none")]
pub width: Option<u32>,
/// Image height, if known (presumably in pixels — confirm).
#[serde(skip_serializing_if = "Option::is_none")]
pub height: Option<u32>,
/// Colorspace name, if known.
#[serde(skip_serializing_if = "Option::is_none")]
pub colorspace: Option<String>,
/// Bits per color component, if known.
#[serde(skip_serializing_if = "Option::is_none")]
pub bits_per_component: Option<u32>,
/// Whether the image is a mask. Defaults to `false` when absent from the
/// deserialized input; always serialized.
#[serde(default)]
pub is_mask: bool,
/// Textual description of the image, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
/// Extraction result obtained from the image itself, if any. Boxed in the
/// struct and described as `Option<ExtractionResult>` in the OpenAPI schema.
///
/// NOTE(review): presumably the output of running OCR on this image —
/// confirm.
#[serde(skip_serializing_if = "Option::is_none")]
#[cfg_attr(feature = "api", schema(value_type = Option<ExtractionResult>))]
pub ocr_result: Option<Box<ExtractionResult>>,
/// Bounding box of the image (see `BoundingBox`), if known.
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub bounding_box: Option<BoundingBox>,
}
/// Selects the shape of the extraction output.
///
/// Variants are (de)serialized in `snake_case`, i.e. `unified` and
/// `element_based`. `Unified` is the `Default`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
#[serde(rename_all = "snake_case")]
pub enum OutputFormat {
/// Default output format.
#[default]
Unified,
/// Element-based output.
///
/// NOTE(review): presumably the mode that fills
/// `ExtractionResult::elements` — confirm.
ElementBased,
}
/// String-backed identifier for an `Element`.
///
/// The inner string is private; code outside this module constructs values
/// through `ElementId::new` and reads them via `AsRef<str>` or `Display`.
/// Derives `Eq` and `Hash`, so it can be used as a map or set key. Described
/// as a plain `String` in the OpenAPI schema.
///
/// NOTE(review): the derived `Deserialize` builds the value directly from the
/// inner string and does not go through `ElementId::new`, so the non-empty
/// check in `new` is not applied to deserialized values.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
#[cfg_attr(feature = "api", schema(value_type = String))]
pub struct ElementId(String);
impl ElementId {
pub fn new(hex_str: impl Into<String>) -> std::result::Result<Self, String> {
let s = hex_str.into();
if s.is_empty() {
return Err("ElementId cannot be empty".to_string());
}
Ok(ElementId(s))
}
}
impl AsRef<str> for ElementId {
    /// Borrows the identifier text as a string slice.
    fn as_ref(&self) -> &str {
        self.0.as_str()
    }
}
impl std::fmt::Display for ElementId {
    /// Writes the identifier text to the formatter.
    ///
    /// The text is emitted through `Formatter::pad`, so width, fill, alignment
    /// and precision flags supplied by the caller (e.g. `{:>12}`) are applied
    /// the same way they are for a plain `str`. With a bare `{}` the output is
    /// exactly the stored identifier.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `write!(f, "{}", ..)` would start a fresh format spec and silently
        // drop the caller's flags; `pad` forwards them.
        f.pad(&self.0)
    }
}
/// Kind of an `Element`.
///
/// Variants are (de)serialized in `snake_case`, e.g. `narrative_text`,
/// `list_item`, `page_break`, `code_block`, `block_quote`.
///
/// NOTE(review): the per-variant descriptions below are read off the variant
/// names; the exact classification rules live in the code that produces
/// elements — confirm there.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
#[serde(rename_all = "snake_case")]
pub enum ElementType {
/// Title element.
Title,
/// Narrative (body) text element.
NarrativeText,
/// Heading element.
Heading,
/// List item element.
ListItem,
/// Table element.
Table,
/// Image element.
Image,
/// Page break marker.
PageBreak,
/// Code block element.
CodeBlock,
/// Block quote element.
BlockQuote,
/// Footer element.
Footer,
/// Header element.
Header,
}
/// Axis-aligned box described by four `f64` coordinates.
///
/// `Copy` and `PartialEq` are derived; `Eq`/`Hash` are not, since the fields
/// are floating point.
///
/// NOTE(review): `(x0, y0)` and `(x1, y1)` are presumably opposite corners;
/// the coordinate origin, axis direction and units are not visible here —
/// confirm.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct BoundingBox {
/// First x coordinate.
pub x0: f64,
/// First y coordinate.
pub y0: f64,
/// Second x coordinate.
pub x1: f64,
/// Second y coordinate.
pub y1: f64,
}
/// Metadata attached to an `Element`.
///
/// No field carries `skip_serializing_if`, so `None` values are written out
/// explicitly when serializing. `additional` has no serde default and is
/// therefore required in deserialized input.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ElementMetadata {
/// Page the element was found on, if known.
///
/// NOTE(review): indexing base (0- vs 1-based) is not visible here —
/// confirm.
pub page_number: Option<usize>,
/// Name of the file the element came from, if known.
pub filename: Option<String>,
/// Bounding box of the element (see `BoundingBox`), if known.
pub coordinates: Option<BoundingBox>,
/// Index of the element, if known.
///
/// NOTE(review): presumably its position in the element sequence —
/// confirm.
pub element_index: Option<usize>,
/// Additional string key/value metadata.
pub additional: HashMap<String, String>,
}
/// A single typed element of a document: identifier, kind, text and metadata.
/// Used as the element type of `ExtractionResult::elements`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct Element {
/// Identifier of the element (see `ElementId`).
pub element_id: ElementId,
/// Kind of the element (see `ElementType`).
pub element_type: ElementType,
/// Text of the element.
pub text: String,
/// Metadata for the element (see `ElementMetadata`).
pub metadata: ElementMetadata,
}