use serde::Serialize;
/// Root value of the serializable document model.
///
/// Groups the source kind, document-level metadata, the page list, an
/// outline tree and any warnings. Because the type derives `Default`, a
/// partially specified value can be completed with `..Default::default()`.
#[derive(Clone, Debug, Default, Serialize)]
pub struct Document {
    /// Kind of source this document is tagged with (`Pdf` or `Docx`).
    pub source: SourceKind,
    /// Document-level metadata fields.
    pub metadata: Metadata,
    /// Pages of the document.
    pub pages: Vec<Page>,
    /// Top-level outline entries; each entry may carry nested children.
    pub outline: Vec<OutlineItem>,
    /// Warnings attached to the document.
    pub warnings: Vec<Warning>,
}
/// Kind of source recorded on a `Document`.
///
/// `Pdf` is the `Default` variant. With no serde attributes present, the
/// unit variants serialize as their variant names (the in-file test asserts
/// `"source":"Pdf"`).
///
/// The enum only has unit variants, so it also derives `Copy` (as
/// `BlockRole` does); values can be passed around without `.clone()`.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize)]
pub enum SourceKind {
    /// PDF source; the default.
    #[default]
    Pdf,
    /// DOCX source.
    Docx,
}
/// Document-level metadata.
///
/// Every textual field is optional; `Default` yields `None` for all of them
/// and `0` for `page_count`.
// NOTE(review): the field names resemble the PDF document information
// dictionary (Title, Author, Subject, ...) — confirm against the producer of
// this struct.
#[derive(Clone, Debug, Default, Serialize)]
pub struct Metadata {
    /// Title, if present.
    pub title: Option<String>,
    /// Author, if present.
    pub author: Option<String>,
    /// Subject, if present.
    pub subject: Option<String>,
    /// Keywords as a single string, if present.
    pub keywords: Option<String>,
    /// Creator, if present.
    pub creator: Option<String>,
    /// Producer, if present.
    pub producer: Option<String>,
    /// Number of pages.
    // NOTE(review): nothing in this type ties the value to
    // `Document::pages.len()` — confirm whether callers keep them in sync.
    pub page_count: u32,
}
/// A single page: its geometry plus the content lists attached to it.
///
/// Derives `Default`, so callers can set only the fields they care about and
/// fill the rest with `..Default::default()` (as the in-file test does).
#[derive(Clone, Debug, Default, Serialize)]
pub struct Page {
    /// Position of the page within the document.
    // NOTE(review): the in-file test uses 0 for the only page — presumably
    // zero-based; confirm.
    pub index: u32,
    /// Page width.
    // NOTE(review): units are not stated here; the test's 612.0 x 792.0
    // would fit PDF points — confirm.
    pub width: f32,
    /// Page height.
    pub height: f32,
    /// Page rotation.
    // NOTE(review): presumably degrees — confirm.
    pub rotation: i32,
    /// Characters on the page.
    pub chars: Vec<Char>,
    /// Text lines on the page.
    pub lines: Vec<TextLine>,
    /// Blocks on the page (paragraphs, tables, images).
    pub blocks: Vec<Block>,
    /// Rectangles on the page.
    pub rects: Vec<Rect>,
    /// Rules on the page.
    pub rules: Vec<Rule>,
    /// Images on the page.
    pub images: Vec<ImageRef>,
    /// Links on the page.
    pub links: Vec<Link>,
}
/// One character entry of a page, with its box and font information.
#[derive(Clone, Debug, Serialize)]
pub struct Char {
    /// Text of this entry. Stored as a `String`, so it is not limited to a
    /// single Unicode scalar value.
    pub text: String,
    /// Bounding box of the character.
    pub bbox: BBox,
    /// Font the character is associated with.
    pub font: FontRef,
    /// Size value for the character.
    // NOTE(review): presumably the font size — confirm units.
    pub size: f32,
    /// Optional three-component colour.
    // NOTE(review): presumably RGB — confirm component order and range.
    pub color: Option<[f32; 3]>,
}
/// Box described by two coordinate pairs, `(x0, y0)` and `(x1, y1)`.
///
/// Small and `Copy`; `Default` is all zeros. Only `PartialEq` is derived
/// because the fields are `f32`.
// NOTE(review): the origin, the axis direction and whether `x0 <= x1` /
// `y0 <= y1` is guaranteed are not fixed by this type — confirm with the
// code that builds these values.
#[derive(Clone, Copy, Debug, Default, PartialEq, Serialize)]
pub struct BBox {
    /// First x coordinate.
    pub x0: f32,
    /// First y coordinate.
    pub y0: f32,
    /// Second x coordinate.
    pub x1: f32,
    /// Second y coordinate.
    pub y1: f32,
}
/// Reference to a font, identified only by its name.
#[derive(Clone, Debug, Default, Serialize)]
pub struct FontRef {
    /// Font name (the in-file test uses `"Helvetica"`).
    pub name: String,
}
/// A line of text on a page.
#[derive(Clone, Debug, Serialize)]
pub struct TextLine {
    /// Bounding box of the line.
    pub bbox: BBox,
    /// Text of the line.
    pub text: String,
    /// Numeric references to characters.
    // NOTE(review): presumably indices into `Page::chars` — confirm.
    pub chars: Vec<u32>,
}
/// A block on a page: a paragraph, a table or an image.
///
/// Each variant wraps the struct holding that block's data.
#[derive(Clone, Debug, Serialize)]
pub enum Block {
    /// Paragraph block.
    Paragraph(Paragraph),
    /// Table block.
    Table(Table),
    /// Image block.
    Image(ImageRef),
}
/// Paragraph block: text plus its box and optional classification.
#[derive(Clone, Debug, Serialize)]
pub struct Paragraph {
    /// Bounding box of the paragraph.
    pub bbox: BBox,
    /// Text of the paragraph.
    pub text: String,
    /// Heading level, or `None`.
    // NOTE(review): presumably `None` means "not a heading" — confirm the
    // valid range of levels.
    pub heading_level: Option<u8>,
    /// Optional role of the paragraph. Left out of the serialized output
    /// entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub role: Option<BlockRole>,
}
/// Role that can be attached to a `Paragraph`.
///
/// Currently has a single variant.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize)]
pub enum BlockRole {
    /// Header/footer role.
    // NOTE(review): presumably marks repeated page header or footer text —
    // confirm with the code that assigns it.
    HeaderFooter,
}
/// Table block: a grid of cells stored row by row.
#[derive(Clone, Debug, Serialize)]
pub struct Table {
    /// Bounding box of the table.
    pub bbox: BBox,
    /// Rows of the table; each inner vector holds the cells of one row.
    // NOTE(review): rows are not required by the type to have equal length.
    pub rows: Vec<Vec<Cell>>,
    /// Source tag of the table (`Ruled`, `Whitespace` or `Docx`).
    pub source: TableSource,
}
/// Source tag stored on a `Table`.
///
/// The enum only has unit variants, so it also derives `Copy` (as
/// `BlockRole` does); values can be passed around without `.clone()`.
// NOTE(review): the variant names suggest how the table was found (ruling
// lines, whitespace gaps, or taken from a DOCX source) — confirm with the
// code that constructs tables.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize)]
pub enum TableSource {
    /// `Ruled` source.
    Ruled,
    /// `Whitespace` source.
    Whitespace,
    /// `Docx` source.
    Docx,
}
/// One cell of a `Table`.
#[derive(Clone, Debug, Serialize)]
pub struct Cell {
    /// Text of the cell.
    pub text: String,
    /// Bounding box of the cell.
    pub bbox: BBox,
    /// Row span of the cell.
    // NOTE(review): presumably 1 for a cell that is not merged — confirm.
    pub row_span: u16,
    /// Column span of the cell.
    pub col_span: u16,
}
/// A rectangle on a page, carrying only its box.
#[derive(Clone, Debug, Serialize)]
pub struct Rect {
    /// Box of the rectangle.
    pub bbox: BBox,
}
/// A rule on a page, given by two coordinate pairs and a width.
// NOTE(review): presumably a straight line segment from `(x0, y0)` to
// `(x1, y1)` with `width` as its stroke width — confirm.
#[derive(Clone, Debug, Serialize)]
pub struct Rule {
    /// First x coordinate.
    pub x0: f32,
    /// First y coordinate.
    pub y0: f32,
    /// Second x coordinate.
    pub x1: f32,
    /// Second y coordinate.
    pub y1: f32,
    /// Width of the rule.
    pub width: f32,
}
/// Reference to an image, used both in `Page::images` and in `Block::Image`.
#[derive(Clone, Debug, Serialize)]
pub struct ImageRef {
    /// Identifier of the image.
    pub id: String,
    /// Bounding box of the image.
    pub bbox: BBox,
    /// Integer width of the image.
    // NOTE(review): presumably pixel dimensions of the image data, as
    // opposed to the placed size in `bbox` — confirm.
    pub width: u32,
    /// Integer height of the image.
    pub height: u32,
}
/// A link region on a page with an optional URI and an optional page target.
// NOTE(review): the type allows both targets to be set, or neither — confirm
// whether producers guarantee exactly one.
#[derive(Clone, Debug, Serialize)]
pub struct Link {
    /// Box of the link region.
    pub bbox: BBox,
    /// Target URI, if any.
    pub uri: Option<String>,
    /// Target page, if any.
    pub page: Option<u32>,
}
/// One entry of the document outline.
///
/// The type is recursive: each entry owns its child entries.
#[derive(Clone, Debug, Serialize)]
pub struct OutlineItem {
    /// Title of the entry.
    pub title: String,
    /// Page the entry refers to, if any.
    pub page: Option<u32>,
    /// Level of the entry.
    // NOTE(review): presumably the nesting depth, which would duplicate the
    // position in the `children` tree — confirm the base value.
    pub level: u8,
    /// Nested entries below this one.
    pub children: Vec<OutlineItem>,
}
/// A warning attached to a `Document`.
#[derive(Clone, Debug, Serialize)]
pub struct Warning {
    /// Page the warning refers to, if any.
    pub page: Option<u32>,
    /// Category of the warning.
    pub kind: WarningKind,
    /// Free-form detail text (the in-file test uses
    /// `"font F1 has no ToUnicode"`).
    pub detail: String,
}
/// Category of a `Warning`.
///
/// With no serde attributes present, the unit variants serialize as their
/// variant names (the in-file test asserts `"kind":"MissingCMap"`).
///
/// The enum only has unit variants, so it also derives `Copy` (as
/// `BlockRole` does); values can be passed around without `.clone()`.
// NOTE(review): the exact condition that raises each variant is decided by
// code outside this file — the per-variant notes below only restate names.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize)]
pub enum WarningKind {
    /// Malformed object.
    MalformedObject,
    /// Missing CMap (the in-file test pairs this with a font lacking
    /// ToUnicode).
    MissingCMap,
    /// Encrypted fallback.
    EncryptedFallback,
    /// Needs OCR.
    NeedsOcr,
    /// Unsupported.
    Unsupported,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a one-page PDF document holding a single character and a
    /// single warning, then checks that serializing the document and a clone
    /// of it gives the same JSON text and that selected key/value pairs
    /// appear in that text.
    #[test]
    fn document_serializes_to_stable_json() {
        let glyph = Char {
            text: "A".to_string(),
            bbox: BBox { x0: 0.0, y0: 0.0, x1: 10.0, y1: 12.0 },
            font: FontRef { name: "Helvetica".to_string() },
            size: 12.0,
            color: None,
        };
        let first_page = Page {
            index: 0,
            width: 612.0,
            height: 792.0,
            chars: vec![glyph],
            ..Default::default()
        };
        let cmap_warning = Warning {
            page: Some(0),
            kind: WarningKind::MissingCMap,
            detail: "font F1 has no ToUnicode".to_string(),
        };
        let document = Document {
            source: SourceKind::Pdf,
            metadata: Metadata { page_count: 1, ..Default::default() },
            pages: vec![first_page],
            warnings: vec![cmap_warning],
            ..Default::default()
        };

        let json = serde_json::to_string(&document).expect("IR serializes");

        // A clone must serialize to exactly the same text.
        let json_from_clone = serde_json::to_string(&document.clone()).unwrap();
        assert_eq!(json, json_from_clone);

        // Unit enum variants appear as their names; the character text survives.
        assert!(json.contains("\"source\":\"Pdf\""));
        assert!(json.contains("\"kind\":\"MissingCMap\""));
        assert!(json.contains("\"text\":\"A\""));
    }
}