use serde::{Deserialize, Serialize};
/// Schema identifier string. `default_schema_version` returns a copy of it,
/// which serde uses for `Document::schema_version` when that field is missing
/// from the input.
pub const SCHEMA_VERSION: &str = "dongler.ir.v2";
/// Route tag that may be attached to a page through `Page::route`.
///
/// Serialized with snake_case variant names: `born_digital`, `scanned`, `hybrid`.
///
/// NOTE(review): the variant names suggest how the page content was obtained;
/// the exact selection rules are not visible in this file — confirm with the producer.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Route {
/// Serialized as `born_digital`.
BornDigital,
/// Serialized as `scanned`.
Scanned,
/// Serialized as `hybrid`.
Hybrid,
}
/// Origin tag for extracted text, stored in `Provenance::text_source`.
///
/// Serialized with snake_case variant names: `text_layer`, `ocr`, `vlm`, `heuristic`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TextSource {
/// Serialized as `text_layer`.
TextLayer,
/// Serialized as `ocr`.
Ocr,
/// Serialized as `vlm`.
Vlm,
/// Serialized as `heuristic`.
Heuristic,
}
/// Provenance record attachable to text, table and figure blocks.
///
/// `text_source` is mandatory; the two optional fields default to `None` on
/// input and are left out of the output when `None`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Provenance {
/// Where the text came from.
pub text_source: TextSource,
/// Optional detector identifier.
/// NOTE(review): presumably the name of the model/tool that produced the block — confirm.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub detector: Option<String>,
/// Optional confidence value.
/// NOTE(review): range and scale are not established in this file.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub confidence: Option<f32>,
}
/// Semantic role of a text block, convertible to and from a snake_case string tag.
///
/// [`BlockKind::as_str`] produces the tag and [`BlockKind::parse`] reads it
/// back. A tag produced by `as_str` always parses to the same kind, with the
/// heading level clamped to `1..=6`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockKind {
    /// Heading with a level. Any `u8` can be stored, but `as_str`, `parse`
    /// and `heading_level` all report the level clamped to `1..=6`.
    Heading(u8),
    /// Tag `paragraph`; also the fallback for unrecognised tags.
    Paragraph,
    /// Tag `list_item`.
    ListItem,
    /// Tag `code`.
    Code,
    /// Tag `formula`.
    Formula,
    /// Tag `caption`.
    Caption,
    /// Tag `page_header`.
    PageHeader,
    /// Tag `page_footer`.
    PageFooter,
    /// Tag `footnote`.
    Footnote,
}

impl BlockKind {
    /// Clamps a raw heading level into the supported `1..=6` range.
    ///
    /// Shared by every method that exposes a level so the string tag and the
    /// numeric accessor can never disagree.
    fn normalized_level(level: u8) -> u8 {
        level.clamp(1, 6)
    }

    /// Returns the string tag for this kind.
    ///
    /// Headings become `heading_N` with `N` clamped to `1..=6`; every other
    /// variant maps to a fixed snake_case name.
    pub fn as_str(&self) -> String {
        let fixed_tag = match self {
            // Headings are the only variant whose tag is computed.
            BlockKind::Heading(level) => {
                return format!("heading_{}", Self::normalized_level(*level));
            }
            BlockKind::Paragraph => "paragraph",
            BlockKind::ListItem => "list_item",
            BlockKind::Code => "code",
            BlockKind::Formula => "formula",
            BlockKind::Caption => "caption",
            BlockKind::PageHeader => "page_header",
            BlockKind::PageFooter => "page_footer",
            BlockKind::Footnote => "footnote",
        };
        fixed_tag.to_owned()
    }

    /// Parses a string tag into a kind.
    ///
    /// `heading_N` (with `N` parseable as `u8`) yields a heading clamped to
    /// `1..=6`. A handful of aliases are accepted (`title`, `list`,
    /// `equation`, `header`, `footer`, bare `heading`). Anything else,
    /// including `heading_` followed by a non-numeric suffix, falls back to
    /// [`BlockKind::Paragraph`]. Matching is exact: no trimming or case folding.
    pub fn parse(kind: &str) -> BlockKind {
        let numbered_level = kind
            .strip_prefix("heading_")
            .and_then(|suffix| suffix.parse::<u8>().ok());
        if let Some(level) = numbered_level {
            return BlockKind::Heading(Self::normalized_level(level));
        }

        match kind {
            "heading" | "title" => BlockKind::Heading(1),
            "list" | "list_item" => BlockKind::ListItem,
            "code" => BlockKind::Code,
            "formula" | "equation" => BlockKind::Formula,
            "caption" => BlockKind::Caption,
            "page_header" | "header" => BlockKind::PageHeader,
            "page_footer" | "footer" => BlockKind::PageFooter,
            "footnote" => BlockKind::Footnote,
            _ => BlockKind::Paragraph,
        }
    }

    /// Returns the heading level clamped to `1..=6`, or `None` for non-headings.
    ///
    /// The clamp keeps this accessor consistent with [`BlockKind::as_str`]:
    /// a directly constructed `Heading(0)` or `Heading(9)` reports the same
    /// level that its string tag carries.
    pub fn heading_level(&self) -> Option<u8> {
        match self {
            BlockKind::Heading(level) => Some(Self::normalized_level(*level)),
            _ => None,
        }
    }

    /// Returns `true` for the page header and page footer kinds.
    pub fn is_page_furniture(&self) -> bool {
        matches!(self, BlockKind::PageHeader | BlockKind::PageFooter)
    }
}
/// Top-level record: schema version, metadata, pages, plus document-level
/// assets and warnings.
///
/// `metadata` and `pages` are required on input. `assets` and `warnings`
/// default to empty and are omitted from the output when empty.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Document {
/// Schema identifier; filled from `default_schema_version()` when absent on input.
#[serde(default = "default_schema_version")]
pub schema_version: String,
/// Document-level metadata.
pub metadata: Metadata,
/// Pages of the document.
pub pages: Vec<Page>,
/// Document-level assets.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub assets: Vec<Asset>,
/// Document-level warnings.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub warnings: Vec<Warning>,
}
/// One page: its number, optional geometry/route information, content blocks,
/// and page-level images, assets and warnings.
///
/// `number` and `blocks` carry no serde default and are therefore required on
/// input. Optional fields default to `None`/empty and are omitted from the
/// output when `None`/empty.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct Page {
/// Page number.
/// NOTE(review): whether numbering is 0- or 1-based is not established here.
pub number: usize,
/// Page width, if known. NOTE(review): units are not stated in this file.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub width: Option<f32>,
/// Page height, if known. NOTE(review): units are not stated in this file.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub height: Option<f32>,
/// Page rotation, if known. NOTE(review): presumably degrees — confirm.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub rotation: Option<i32>,
/// Route tag for this page, if one was recorded.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub route: Option<Route>,
/// Optional bounding box associated with the page.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bbox: Option<BBox>,
/// Content blocks of the page.
pub blocks: Vec<Block>,
/// Image objects recorded for the page.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub images: Vec<ImageObject>,
/// Page-level assets.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub assets: Vec<Asset>,
/// Page-level warnings.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub warnings: Vec<Warning>,
}
/// A content block on a page.
///
/// Internally tagged for serde: the payload's fields are serialized alongside
/// a `"type"` field whose value is `text`, `table` or `figure`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum Block {
/// Text content; tagged `text`.
Text(TextBlock),
/// Tabular content; tagged `table`.
Table(TableBlock),
/// Figure content; tagged `figure`.
Figure(FigureBlock),
}
/// Payload of a text block.
///
/// `text` and `kind` are required on input; the remaining fields default to
/// `None`/empty and are omitted from the output when `None`/empty.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TextBlock {
/// Text content of the block.
pub text: String,
/// Kind tag stored as a plain string.
/// NOTE(review): presumably one of the tags produced by `BlockKind::as_str`
/// and readable via `BlockKind::parse` — confirm against the producer.
pub kind: String,
/// Optional bounding box of the block.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bbox: Option<BBox>,
/// Line-level breakdown of the block, when available.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub lines: Vec<Line>,
/// Source anchors recorded for the block.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub source_anchors: Vec<SourceAnchor>,
/// Optional confidence record.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub confidence: Option<Confidence>,
/// Optional provenance record.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub provenance: Option<Provenance>,
}
/// Payload of a table block.
///
/// `headers` and `rows` are required on input. `caption` has no
/// `skip_serializing_if`, so it is still emitted when `None`; the other
/// optional fields are omitted from the output when `None`/empty.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TableBlock {
/// Header cell texts.
pub headers: Vec<String>,
/// Row data: one inner vector of cell texts per row.
pub rows: Vec<Vec<String>>,
/// Optional caption text.
pub caption: Option<String>,
/// Optional bounding box of the table.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bbox: Option<BBox>,
/// Cell-level records, when available.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub cells: Vec<TableCell>,
/// Optional HTML string for the table.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub html: Option<String>,
/// Source anchors recorded for the table.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub source_anchors: Vec<SourceAnchor>,
/// Optional confidence record.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub confidence: Option<Confidence>,
/// Optional provenance record.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub provenance: Option<Provenance>,
}
/// Payload of a figure block.
///
/// `alt_text` and `caption` have no `skip_serializing_if`, so they are still
/// emitted when `None`; the other optional fields are omitted from the output
/// when `None`/empty.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct FigureBlock {
/// Optional alternative text.
pub alt_text: Option<String>,
/// Optional caption text.
pub caption: Option<String>,
/// Optional bounding box of the figure.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bbox: Option<BBox>,
/// Optional image reference.
/// NOTE(review): presumably the `id` of an `ImageObject` or `Asset` — confirm.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub image_ref: Option<String>,
/// Source anchors recorded for the figure.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub source_anchors: Vec<SourceAnchor>,
/// Optional confidence record.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub confidence: Option<Confidence>,
/// Optional provenance record.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub provenance: Option<Provenance>,
}
/// Document-level metadata stored in `Document::metadata`.
///
/// `format`, `engine` and the three counters are required on input.
/// `source` and `title` have no `skip_serializing_if`, so they are still
/// emitted when `None`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Metadata {
/// Format label of the input.
pub format: String,
/// Engine label. NOTE(review): presumably names the extraction backend — confirm.
pub engine: String,
/// Optional source label. NOTE(review): presumably the input path or URI — confirm.
pub source: Option<String>,
/// Optional document title.
pub title: Option<String>,
/// Character count. NOTE(review): counting rules are not visible in this file.
pub character_count: usize,
/// Word count. NOTE(review): counting rules are not visible in this file.
pub word_count: usize,
/// Block count.
pub block_count: usize,
/// Optional input file size in bytes.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub file_size_bytes: Option<u64>,
/// Optional PDF version string.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub pdf_version: Option<String>,
/// Encryption flag; `false` when absent on input, always written on output.
#[serde(default)]
pub encrypted: bool,
}
/// Per-input outcome record: a path, a success flag, and an optional document
/// and error message.
///
/// NOTE(review): presumably `document` is set when `ok` is `true` and `error`
/// when it is `false`; this file does not enforce that — confirm with the producer.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct BatchResult {
/// Path of the input this result belongs to.
pub path: String,
/// Success flag.
pub ok: bool,
/// Extracted document, if any.
pub document: Option<Document>,
/// Error message, if any.
pub error: Option<String>,
}
/// Extraction options.
///
/// Fields missing from the input deserialize to the same values the `Default`
/// impl uses: `include_geometry` and `include_assets` become `true`,
/// `suppress_headers_footers` becomes `false`, and the optional fields become `None`.
///
/// NOTE(review): `Debug` and `Serialize` are derived, so a `Some` password is
/// printed by `{:?}` and written out on serialization — confirm this is intended.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ExtractOptions {
/// Geometry toggle; defaults to `true` via `default_include_geometry`.
#[serde(default = "default_include_geometry")]
pub include_geometry: bool,
/// Asset toggle; defaults to `true` via `default_include_assets`.
#[serde(default = "default_include_assets")]
pub include_assets: bool,
/// Optional parallelism cap; omitted from the output when `None`.
/// NOTE(review): the meaning of `None` (e.g. "let the engine decide") is not visible here.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub max_parallelism: Option<usize>,
/// Header/footer suppression toggle; defaults to `false`.
#[serde(default)]
pub suppress_headers_footers: bool,
/// Optional password; omitted from the output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub password: Option<String>,
}
impl Default for ExtractOptions {
fn default() -> Self {
Self {
include_geometry: true,
include_assets: true,
max_parallelism: None,
suppress_headers_footers: false,
password: None,
}
}
}
/// Rectangle described by an `x`/`y` position and a `width`/`height` extent.
///
/// NOTE(review): units, origin corner and axis direction are not established
/// in this file — confirm with the producer before doing geometry on it.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct BBox {
/// Horizontal position.
pub x: f32,
/// Vertical position.
pub y: f32,
/// Horizontal extent.
pub width: f32,
/// Vertical extent.
pub height: f32,
}
/// One line inside a text block (see `TextBlock::lines`).
///
/// `text` is required on input; `bbox` and `spans` default to `None`/empty
/// and are omitted from the output when `None`/empty.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Line {
/// Text of the line.
pub text: String,
/// Optional bounding box of the line.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bbox: Option<BBox>,
/// Span-level breakdown of the line, when available.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub spans: Vec<Span>,
}
/// A run of text inside a line (see `Line::spans`) with optional geometry and
/// font attributes.
///
/// Only `text` is required on input. `bold` and `italic` default to `false`
/// and are written to the output only when `true`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Span {
/// Text of the span.
pub text: String,
/// Optional bounding box of the span.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bbox: Option<BBox>,
/// Optional font label.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub font: Option<String>,
/// Optional size value. NOTE(review): presumably the font size; units not stated here.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub size: Option<f32>,
/// Bold flag; omitted from the output when `false`.
#[serde(default, skip_serializing_if = "is_false")]
pub bold: bool,
/// Italic flag; omitted from the output when `false`.
#[serde(default, skip_serializing_if = "is_false")]
pub italic: bool,
}
/// One cell of a table (see `TableBlock::cells`).
///
/// `row`, `column` and `text` are required on input. `col_span` and
/// `row_span` default to `1` and are omitted from the output when they equal `1`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct TableCell {
/// Row index of the cell. NOTE(review): index base (0 or 1) is not established here.
pub row: usize,
/// Column index of the cell. NOTE(review): index base (0 or 1) is not established here.
pub column: usize,
/// Text of the cell.
pub text: String,
/// Optional bounding box of the cell.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bbox: Option<BBox>,
/// Header flag; `false` when absent on input, always written on output.
#[serde(default)]
pub is_header: bool,
/// Number of columns spanned; defaults to `1`.
#[serde(default = "one", skip_serializing_if = "is_one")]
pub col_span: usize,
/// Number of rows spanned; defaults to `1`.
#[serde(default = "one", skip_serializing_if = "is_one")]
pub row_span: usize,
}
/// Pointer from an extracted item back to its place in the source.
///
/// `page_number` and `extraction_method` are required on input;
/// `pdf_object_ids` and `bbox` default to empty/`None` and are omitted from
/// the output when empty/`None`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SourceAnchor {
/// Page number the anchor refers to.
/// NOTE(review): presumably uses the same numbering as `Page::number` — confirm.
pub page_number: usize,
/// PDF object identifiers, stored as strings.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub pdf_object_ids: Vec<String>,
/// Optional bounding box of the anchored region.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bbox: Option<BBox>,
/// Extraction method label; free-form string as far as this file shows.
pub extraction_method: String,
}
/// Confidence record attachable to text, table and figure blocks.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Confidence {
/// Confidence score. NOTE(review): range and scale are not established in this file.
pub score: f32,
/// Calibration flag; `false` when absent on input, always written on output.
#[serde(default)]
pub calibrated: bool,
}
/// Warning record, stored at document level (`Document::warnings`) or page
/// level (`Page::warnings`).
///
/// `code`, `severity` and `message` are required, free-form strings as far as
/// this file shows.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Warning {
/// Warning code.
pub code: String,
/// Severity label. NOTE(review): the set of allowed values is not defined here.
pub severity: String,
/// Warning message text.
pub message: String,
/// Optional anchor locating the warning in the source; omitted from the output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub source_anchor: Option<SourceAnchor>,
}
/// Asset record, stored at document level (`Document::assets`) or page level
/// (`Page::assets`).
///
/// `id` and `kind` are required on input; the optional fields default to
/// `None` and are omitted from the output when `None`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Asset {
/// Identifier of the asset.
pub id: String,
/// Kind label. NOTE(review): the set of allowed values is not defined here.
pub kind: String,
/// Optional object identifier. NOTE(review): presumably a PDF object id — confirm.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub object_id: Option<String>,
/// Optional bounding box of the asset.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bbox: Option<BBox>,
/// Optional width. NOTE(review): presumably pixels — confirm.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub width: Option<u32>,
/// Optional height. NOTE(review): presumably pixels — confirm.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub height: Option<u32>,
}
/// Image record stored in `Page::images`.
///
/// Only `id` is required on input; the optional fields default to `None` and
/// are omitted from the output when `None`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ImageObject {
/// Identifier of the image.
pub id: String,
/// Optional object identifier. NOTE(review): presumably a PDF object id — confirm.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub object_id: Option<String>,
/// Optional bounding box of the image.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bbox: Option<BBox>,
/// Optional width. NOTE(review): presumably pixels — confirm.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub width: Option<u32>,
/// Optional height. NOTE(review): presumably pixels — confirm.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub height: Option<u32>,
}
/// Returns an owned copy of [`SCHEMA_VERSION`].
///
/// Serde calls this to fill `Document::schema_version` when the field is
/// missing from the input.
pub fn default_schema_version() -> String {
    String::from(SCHEMA_VERSION)
}
/// Serde default for `ExtractOptions::include_geometry`: the option is on
/// unless the input explicitly sets it.
fn default_include_geometry() -> bool {
true
}
/// Serde default for `ExtractOptions::include_assets`: the option is on
/// unless the input explicitly sets it.
fn default_include_assets() -> bool {
true
}
/// Serde default for `TableCell::col_span` and `TableCell::row_span`: a cell
/// spans one column/row unless the input says otherwise.
fn one() -> usize {
1
}
/// Serde `skip_serializing_if` predicate for the table-cell span fields:
/// returns `true` exactly when the value is `1`, so default spans are not written.
fn is_one(value: &usize) -> bool {
    matches!(*value, 1)
}
/// Serde `skip_serializing_if` predicate for boolean flags: returns `true`
/// exactly when the flag is `false`, so unset flags are not written.
fn is_false(value: &bool) -> bool {
    // `Not` is implemented for `&bool`, so no explicit dereference is needed.
    !value
}