use serde::{Deserialize, Serialize};
/// Schema identifier string for the types in this file.
///
/// `default_schema_version` returns it as an owned `String`, which is what
/// `Document::schema_version` falls back to when the field is absent on input.
pub const SCHEMA_VERSION: &str = "dongler.ir.v1";
/// Top-level serializable container: document metadata plus its pages.
///
/// `metadata` and `pages` must be present when deserializing. `assets` and `warnings` may be
/// absent on input and are left out of the serialized output while they are empty.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Document {
    /// Schema identifier; supplied by `default_schema_version` when missing from the input.
    #[serde(default = "default_schema_version")]
    pub schema_version: String,

    /// Document-level metadata (required).
    pub metadata: Metadata,

    /// Pages of the document (required; may be an empty list).
    pub pages: Vec<Page>,

    /// Document-level assets; omitted from output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub assets: Vec<Asset>,

    /// Document-level warnings; omitted from output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub warnings: Vec<Warning>,
}
/// A single page and everything recorded for it.
///
/// Only `number` and `blocks` are required on input. Every other field falls back to its
/// `Default` value when absent and is skipped during serialization when `None` or empty.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Page {
    /// Page number as recorded by the producer (whether it is 0- or 1-based is not visible
    /// in this file).
    pub number: usize,

    /// Page width, when known. Units are not specified in this file.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub width: Option<f32>,

    /// Page height, when known. Units are not specified in this file.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub height: Option<f32>,

    /// Page rotation, when known (presumably in degrees; confirm with the producer).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub rotation: Option<i32>,

    /// Bounding box associated with the page, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub bbox: Option<BBox>,

    /// Content blocks on this page (required; may be an empty list).
    pub blocks: Vec<Block>,

    /// Image objects recorded for this page; omitted from output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub images: Vec<ImageObject>,

    /// Page-level assets; omitted from output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub assets: Vec<Asset>,

    /// Page-level warnings; omitted from output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub warnings: Vec<Warning>,
}
/// One unit of page content.
///
/// Uses serde's internally tagged representation: the fields of the wrapped struct are emitted
/// next to a `"type"` key that holds the snake_case variant name (`"text"`, `"table"` or
/// `"figure"`).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum Block {
    /// Textual content; tagged as `"text"`.
    Text(TextBlock),
    /// Tabular content; tagged as `"table"`.
    Table(TableBlock),
    /// A figure; tagged as `"figure"`.
    Figure(FigureBlock),
}
/// Payload of [`Block::Text`].
///
/// `text` and `kind` are required on input; the remaining fields default when absent and are
/// skipped on output when `None` or empty.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct TextBlock {
    /// The block's text.
    pub text: String,

    /// Free-form label for the kind of text; the set of valid values is defined by the
    /// producer and is not visible in this file.
    pub kind: String,

    /// Bounding box of the block, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub bbox: Option<BBox>,

    /// Line-level breakdown of the block; omitted from output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub lines: Vec<Line>,

    /// Source anchors attached to this block; omitted from output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub source_anchors: Vec<SourceAnchor>,

    /// Confidence attached to this block, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub confidence: Option<Confidence>,
}
/// Payload of [`Block::Table`].
///
/// `headers` and `rows` are required on input. `caption` carries no `skip_serializing_if`, so
/// it is always written out, even when it is `None`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct TableBlock {
    /// Header strings (required; may be an empty list).
    pub headers: Vec<String>,

    /// Table rows, each a list of cell strings (required; may be an empty list).
    pub rows: Vec<Vec<String>>,

    /// Optional caption; serialized even when `None`.
    pub caption: Option<String>,

    /// Bounding box of the table, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub bbox: Option<BBox>,

    /// Per-cell records; omitted from output when empty. How these relate to `rows` is
    /// decided by the producer and is not visible in this file.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub cells: Vec<TableCell>,

    /// Source anchors attached to this table; omitted from output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub source_anchors: Vec<SourceAnchor>,

    /// Confidence attached to this table, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub confidence: Option<Confidence>,
}
/// Payload of [`Block::Figure`].
///
/// `alt_text` and `caption` carry no `skip_serializing_if`, so both are always written out,
/// even when `None`. The other fields default when absent and are skipped on output when
/// `None` or empty.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct FigureBlock {
    /// Optional alternative text; serialized even when `None`.
    pub alt_text: Option<String>,

    /// Optional caption; serialized even when `None`.
    pub caption: Option<String>,

    /// Bounding box of the figure, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub bbox: Option<BBox>,

    /// Reference to an image (presumably the `id` of an `ImageObject` or `Asset`; confirm
    /// with the producer).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub image_ref: Option<String>,

    /// Source anchors attached to this figure; omitted from output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub source_anchors: Vec<SourceAnchor>,

    /// Confidence attached to this figure, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub confidence: Option<Confidence>,
}
/// Document-level metadata stored in `Document::metadata`.
///
/// `format`, `engine` and the three counters are required on input. `source` and `title` are
/// optional but always serialized, even when `None`. `encrypted` defaults to `false` when
/// absent and is always serialized.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Metadata {
    /// Format label (required). Valid values are defined by the producer.
    pub format: String,

    /// Engine label (required). Valid values are defined by the producer.
    pub engine: String,

    /// Optional source identifier; serialized even when `None`.
    pub source: Option<String>,

    /// Optional title; serialized even when `None`.
    pub title: Option<String>,

    /// Character count reported by the producer.
    pub character_count: usize,

    /// Word count reported by the producer.
    pub word_count: usize,

    /// Block count reported by the producer.
    pub block_count: usize,

    /// File size in bytes, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub file_size_bytes: Option<u64>,

    /// PDF version string, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pdf_version: Option<String>,

    /// Encryption flag; `false` when absent from the input.
    #[serde(default)]
    pub encrypted: bool,
}
/// Result record for one path of a batch.
///
/// None of the fields carry serde attributes, so `document` and `error` are always serialized,
/// even when `None`.
// NOTE(review): presumably `document` is set when `ok` is true and `error` otherwise, but this
// type does not enforce that; confirm against the code that builds it.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct BatchResult {
    /// Path this result belongs to.
    pub path: String,

    /// Success flag.
    pub ok: bool,

    /// The document, if one is attached.
    pub document: Option<Document>,

    /// Error text, if any.
    pub error: Option<String>,
}
/// Options for extraction. The exact effect of each switch is defined by the code that
/// consumes this struct, which is not visible in this file.
///
/// Every field may be absent on input: `include_geometry` and `include_assets` then default to
/// `true` (via `default_include_geometry` / `default_include_assets`), `suppress_headers_footers`
/// to `false`, and `max_parallelism` / `password` to `None`.
///
/// `Debug` is implemented by hand instead of derived so that a supplied `password` never ends
/// up verbatim in logs, panic messages or `dbg!` output.
// NOTE(review): the password is still written out by `Serialize` whenever it is set; confirm
// that this is intended wherever serialized options are stored or transmitted.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ExtractOptions {
    /// Geometry switch; `true` when absent from the input.
    #[serde(default = "default_include_geometry")]
    pub include_geometry: bool,

    /// Assets switch; `true` when absent from the input.
    #[serde(default = "default_include_assets")]
    pub include_assets: bool,

    /// Optional parallelism setting; omitted from output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_parallelism: Option<usize>,

    /// Header/footer suppression switch; `false` when absent from the input.
    #[serde(default)]
    pub suppress_headers_footers: bool,

    /// Optional password; omitted from output when `None`. Redacted in `Debug` output.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub password: Option<String>,
}

impl std::fmt::Debug for ExtractOptions {
    /// Formats like the derived `Debug` would, except that a present password is shown as
    /// `Some("<redacted>")` rather than its real value.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Keep the `Some`/`None` distinction visible without exposing the secret itself.
        let password = self.password.as_ref().map(|_| "<redacted>");
        f.debug_struct("ExtractOptions")
            .field("include_geometry", &self.include_geometry)
            .field("include_assets", &self.include_assets)
            .field("max_parallelism", &self.max_parallelism)
            .field("suppress_headers_footers", &self.suppress_headers_footers)
            .field("password", &password)
            .finish()
    }
}
impl Default for ExtractOptions {
    /// Builds the baseline options: `include_geometry` and `include_assets` are `true`,
    /// `suppress_headers_footers` is `false`, and `max_parallelism` and `password` are `None`.
    fn default() -> Self {
        // Both inclusion switches start out enabled.
        let (include_geometry, include_assets) = (true, true);

        Self {
            include_geometry,
            include_assets,
            suppress_headers_footers: false,
            max_parallelism: None,
            password: None,
        }
    }
}
/// Axis-aligned rectangle given by a position and a size, all `f32`.
///
/// The coordinate origin, axis direction and units are not specified in this file. All four
/// fields are required on input. The type is `Copy`.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct BBox {
    /// X coordinate of the box position.
    pub x: f32,
    /// Y coordinate of the box position.
    pub y: f32,
    /// Box width.
    pub width: f32,
    /// Box height.
    pub height: f32,
}
/// One line of text inside a [`TextBlock`].
///
/// Only `text` is required on input.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Line {
    /// The line's text.
    pub text: String,

    /// Bounding box of the line, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub bbox: Option<BBox>,

    /// Span-level breakdown of the line; omitted from output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub spans: Vec<Span>,
}
/// A run of text inside a [`Line`], optionally annotated with geometry and font details.
///
/// Only `text` is required on input; the optional fields are skipped on output when `None`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Span {
    /// The span's text.
    pub text: String,

    /// Bounding box of the span, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub bbox: Option<BBox>,

    /// Font identifier, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub font: Option<String>,

    /// Size value, when known (presumably the font size; units are not specified here).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub size: Option<f32>,
}
/// One cell record of a [`TableBlock`].
///
/// `row`, `column` and `text` are required on input. `is_header` defaults to `false` when
/// absent and is always serialized.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct TableCell {
    /// Row index of the cell (index base is not visible in this file).
    pub row: usize,

    /// Column index of the cell (index base is not visible in this file).
    pub column: usize,

    /// The cell's text.
    pub text: String,

    /// Bounding box of the cell, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub bbox: Option<BBox>,

    /// Header flag; `false` when absent from the input.
    #[serde(default)]
    pub is_header: bool,
}
/// Pointer from an extracted item back to a location in the source.
///
/// `page_number` and `extraction_method` are required on input.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SourceAnchor {
    /// Page number the anchor refers to.
    pub page_number: usize,

    /// PDF object identifiers, as strings; omitted from output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub pdf_object_ids: Vec<String>,

    /// Bounding box of the anchored region, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub bbox: Option<BBox>,

    /// Free-form label for the extraction method; valid values are defined by the producer.
    pub extraction_method: String,
}
/// Confidence attached to a block.
///
/// `score` is required on input. `calibrated` defaults to `false` when absent and is always
/// serialized.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Confidence {
    /// Confidence score. The valid range is not specified or checked in this file.
    pub score: f32,

    /// Calibration flag; `false` when absent from the input.
    #[serde(default)]
    pub calibrated: bool,
}
/// A warning attached to a [`Document`] or a [`Page`].
///
/// `code`, `severity` and `message` are required free-form strings; their valid values are
/// defined by the producer and are not visible in this file.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Warning {
    /// Warning code.
    pub code: String,

    /// Severity label.
    pub severity: String,

    /// Warning message text.
    pub message: String,

    /// Source location the warning refers to, if any; omitted from output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source_anchor: Option<SourceAnchor>,
}
/// An asset listed on a [`Document`] or a [`Page`].
///
/// `id` and `kind` are required on input; the optional fields are skipped on output when
/// `None`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Asset {
    /// Asset identifier.
    pub id: String,

    /// Free-form label for the kind of asset; valid values are defined by the producer.
    pub kind: String,

    /// Identifier of the underlying object, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub object_id: Option<String>,

    /// Bounding box of the asset, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub bbox: Option<BBox>,

    /// Width as an integer, when known (presumably pixels; confirm with the producer).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub width: Option<u32>,

    /// Height as an integer, when known (presumably pixels; confirm with the producer).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub height: Option<u32>,
}
/// An image listed in `Page::images`.
///
/// Has the same fields as [`Asset`] except that there is no `kind`. Only `id` is required on
/// input; the optional fields are skipped on output when `None`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ImageObject {
    /// Image identifier.
    pub id: String,

    /// Identifier of the underlying object, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub object_id: Option<String>,

    /// Bounding box of the image, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub bbox: Option<BBox>,

    /// Width as an integer, when known (presumably pixels; confirm with the producer).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub width: Option<u32>,

    /// Height as an integer, when known (presumably pixels; confirm with the producer).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub height: Option<u32>,
}
/// Returns `SCHEMA_VERSION` as a freshly allocated `String`.
///
/// Referenced by name from the `#[serde(default = "default_schema_version")]` attribute on
/// `Document::schema_version`, so it supplies the value when that field is missing on input.
pub fn default_schema_version() -> String {
    String::from(SCHEMA_VERSION)
}
/// Serde fallback for `ExtractOptions::include_geometry`: the flag is `true` unless the input
/// sets it explicitly.
fn default_include_geometry() -> bool {
    true
}
/// Serde fallback for `ExtractOptions::include_assets`: the flag is `true` unless the input
/// sets it explicitly.
fn default_include_assets() -> bool {
    true
}