use serde::{Deserialize, Serialize};
use super::extraction::BoundingBox;
/// Position of a node inside [`DocumentStructure::nodes`].
///
/// The wrapped `u32` is used directly as a vector index by
/// [`DocumentStructure`]; it is serialized as a plain integer.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
#[cfg_attr(feature = "api", schema(value_type = u32))]
pub struct NodeIndex(pub u32);
/// String identifier attached to a [`DocumentNode`].
///
/// The inner string is private; build values with [`NodeId::new`] or
/// [`NodeId::generate`] and read them back through `AsRef<str>` / `Display`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
#[cfg_attr(feature = "api", schema(value_type = String))]
pub struct NodeId(String);
impl NodeId {
pub fn new(id: impl Into<String>) -> Self {
Self(id.into())
}
pub fn generate(node_type: &str, text: &str, page: Option<u32>, index: u32) -> Self {
let type_hash = node_type
.bytes()
.fold(0u64, |acc, b| acc.wrapping_mul(31).wrapping_add(b as u64));
let text_hash = text
.bytes()
.fold(0u64, |acc, b| acc.wrapping_mul(31).wrapping_add(b as u64));
let page_hash = page.map(|p| p as u64).unwrap_or(u64::MAX);
let combined = type_hash
.wrapping_mul(65599)
.wrapping_add(text_hash)
.wrapping_mul(65599)
.wrapping_add(page_hash)
.wrapping_mul(65599)
.wrapping_add(index as u64);
Self(format!("node-{:x}", combined))
}
}
impl AsRef<str> for NodeId {
    /// Borrows the identifier as a string slice.
    fn as_ref(&self) -> &str {
        self.0.as_str()
    }
}
impl std::fmt::Display for NodeId {
    /// Writes the raw identifier string; width and padding flags on the
    /// outer format spec are not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}
/// Flat storage for a document tree.
///
/// Nodes live in a single vector; hierarchy is expressed through the
/// [`NodeIndex`] values in each node's `parent` and `children` fields, which
/// index into `nodes`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
#[cfg_attr(feature = "api", schema(no_recursion))]
pub struct DocumentStructure {
/// All nodes; a node's position in this vector is its [`NodeIndex`].
pub nodes: Vec<DocumentNode>,
}
/// One node of a [`DocumentStructure`].
///
/// Optional and empty fields are left out of the serialized form (see the
/// `skip_serializing_if` attributes below).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct DocumentNode {
/// Identifier of this node.
pub id: NodeId,
/// Type-specific payload.
pub content: NodeContent,
/// Index of the parent node; `None` for a root.
#[serde(skip_serializing_if = "Option::is_none")]
pub parent: Option<NodeIndex>,
/// Indices of child nodes; an absent field deserializes to an empty list.
#[serde(skip_serializing_if = "Vec::is_empty", default)]
pub children: Vec<NodeIndex>,
/// Layer the node belongs to; omitted from output when it is `Body`.
#[serde(default, skip_serializing_if = "ContentLayer::is_default")]
pub content_layer: ContentLayer,
/// Page the node is on, if known (numbering base is not fixed by this file).
#[serde(skip_serializing_if = "Option::is_none")]
pub page: Option<u32>,
/// Presumably the last page for nodes spanning several pages — confirm with producers.
#[serde(skip_serializing_if = "Option::is_none")]
pub page_end: Option<u32>,
/// Bounding box of the node, if known.
#[serde(skip_serializing_if = "Option::is_none")]
pub bbox: Option<BoundingBox>,
/// Inline annotations; an absent field deserializes to an empty list.
#[serde(skip_serializing_if = "Vec::is_empty", default)]
pub annotations: Vec<TextAnnotation>,
}
/// Layer a node belongs to, serialized in `snake_case`.
///
/// `Body` is the default. [`DocumentStructure::body_roots`] selects `Body`
/// roots and [`DocumentStructure::furniture_roots`] selects every other layer.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
#[serde(rename_all = "snake_case")]
pub enum ContentLayer {
/// Default layer.
#[default]
Body,
/// Header layer.
Header,
/// Footer layer.
Footer,
/// Footnote layer.
Footnote,
}
impl ContentLayer {
    /// Reports whether this is the default layer, [`ContentLayer::Body`].
    ///
    /// Takes `&self` so it can serve as a serde `skip_serializing_if` predicate.
    pub fn is_default(&self) -> bool {
        matches!(self, ContentLayer::Body)
    }
}
/// Type-specific payload of a [`DocumentNode`].
///
/// Serialized as an internally tagged enum: the variant name is written in
/// `snake_case` under the `node_type` key alongside the variant's fields.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
#[serde(tag = "node_type", rename_all = "snake_case")]
pub enum NodeContent {
/// Title with its text.
Title { text: String },
/// Heading with a numeric level and its text.
Heading { level: u8, text: String },
/// Paragraph of text.
Paragraph { text: String },
/// List container; `ordered` flags an ordered list. Carries no text itself.
List { ordered: bool },
/// Single list item with its text.
ListItem { text: String },
/// Table described by a cell grid.
Table { grid: TableGrid },
/// Image with optional metadata.
Image {
/// Optional textual description.
#[serde(skip_serializing_if = "Option::is_none")]
description: Option<String>,
/// Optional index of the image (the collection it indexes is not visible in this file).
#[serde(skip_serializing_if = "Option::is_none")]
image_index: Option<u32>,
},
/// Code block.
Code {
/// Code text.
text: String,
/// Optional language name.
#[serde(skip_serializing_if = "Option::is_none")]
language: Option<String>,
},
/// Quote; carries no fields of its own.
Quote,
/// Formula with its text.
Formula { text: String },
/// Footnote with its text.
Footnote { text: String },
/// Grouping node with optional label and heading information.
Group {
/// Optional label.
#[serde(skip_serializing_if = "Option::is_none")]
label: Option<String>,
/// Optional heading level.
#[serde(skip_serializing_if = "Option::is_none")]
heading_level: Option<u8>,
/// Optional heading text.
#[serde(skip_serializing_if = "Option::is_none")]
heading_text: Option<String>,
},
/// Page break marker; carries no fields.
PageBreak,
}
/// Table content: declared dimensions plus a list of cells.
///
/// Nothing in this file checks that `cells` agrees with `rows` and `cols`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct TableGrid {
/// Number of rows.
pub rows: u32,
/// Number of columns.
pub cols: u32,
/// Cells of the table; each carries its own row/column position.
pub cells: Vec<GridCell>,
}
/// One cell of a [`TableGrid`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct GridCell {
/// Cell text.
pub content: String,
/// Row position of the cell.
pub row: u32,
/// Column position of the cell.
pub col: u32,
/// Row span; deserializes to 1 (via `default_span`) when the field is absent.
#[serde(default = "default_span")]
pub row_span: u32,
/// Column span; deserializes to 1 (via `default_span`) when the field is absent.
#[serde(default = "default_span")]
pub col_span: u32,
/// Header-cell flag; deserializes to `false` when the field is absent.
#[serde(default)]
pub is_header: bool,
/// Bounding box of the cell, if known; omitted from output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub bbox: Option<BoundingBox>,
}
/// Serde default for `GridCell::row_span` and `GridCell::col_span`: a cell
/// with no explicit span covers exactly one row or column.
fn default_span() -> u32 {
    const SINGLE: u32 = 1;
    SINGLE
}
/// Annotation applied to a `start..end` range.
///
/// NOTE(review): the offsets presumably index into the owning node's text;
/// whether they count bytes or characters, and whether `end` is exclusive,
/// is not established in this file — confirm with producers.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct TextAnnotation {
/// Start offset of the range.
pub start: u32,
/// End offset of the range.
pub end: u32,
/// What the annotation means.
pub kind: AnnotationKind,
}
/// Kind of a [`TextAnnotation`].
///
/// Serialized as an internally tagged enum: the variant name is written in
/// `snake_case` under the `annotation_type` key.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
#[serde(tag = "annotation_type", rename_all = "snake_case")]
pub enum AnnotationKind {
/// Bold text.
Bold,
/// Italic text.
Italic,
/// Underlined text.
Underline,
/// Struck-through text.
Strikethrough,
/// Inline code.
Code,
/// Subscript text.
Subscript,
/// Superscript text.
Superscript,
/// Hyperlink.
Link {
/// Link target.
url: String,
/// Optional link title; omitted from output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
title: Option<String>,
},
}
impl From<(f32, f32, f32, f32)> for BoundingBox {
fn from((left, top, right, bottom): (f32, f32, f32, f32)) -> Self {
BoundingBox {
x0: left as f64,
y0: top as f64,
x1: right as f64,
y1: bottom as f64,
}
}
}
impl NodeContent {
pub fn text(&self) -> Option<&str> {
match self {
NodeContent::Title { text }
| NodeContent::Heading { text, .. }
| NodeContent::Paragraph { text }
| NodeContent::ListItem { text }
| NodeContent::Code { text, .. }
| NodeContent::Formula { text }
| NodeContent::Footnote { text } => Some(text),
NodeContent::Table { .. }
| NodeContent::Image { .. }
| NodeContent::List { .. }
| NodeContent::Quote
| NodeContent::Group { .. }
| NodeContent::PageBreak => None,
}
}
pub fn node_type_str(&self) -> &'static str {
match self {
NodeContent::Title { .. } => "title",
NodeContent::Heading { .. } => "heading",
NodeContent::Paragraph { .. } => "paragraph",
NodeContent::List { .. } => "list",
NodeContent::ListItem { .. } => "list_item",
NodeContent::Table { .. } => "table",
NodeContent::Image { .. } => "image",
NodeContent::Code { .. } => "code",
NodeContent::Quote => "quote",
NodeContent::Formula { .. } => "formula",
NodeContent::Footnote { .. } => "footnote",
NodeContent::Group { .. } => "group",
NodeContent::PageBreak => "page_break",
}
}
}
impl DocumentStructure {
    /// Creates an empty structure.
    pub fn new() -> Self {
        Self { nodes: Vec::new() }
    }

    /// Creates an empty structure with room reserved for `capacity` nodes.
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            nodes: Vec::with_capacity(capacity),
        }
    }

    /// Appends `node` and returns the index it was stored at.
    ///
    /// The node's `parent` and `children` fields are stored as given; use
    /// [`add_child`](Self::add_child) to link nodes.
    ///
    /// # Panics
    ///
    /// Panics if the new node's position does not fit in a `u32`, rather than
    /// silently returning a wrapped-around index.
    pub fn push_node(&mut self, node: DocumentNode) -> NodeIndex {
        let position = self.nodes.len();
        assert!(
            position <= u32::MAX as usize,
            "DocumentStructure cannot index more than u32::MAX nodes"
        );
        self.nodes.push(node);
        // Lossless: the assertion above bounds `position` by `u32::MAX`.
        NodeIndex(position as u32)
    }

    /// Links `child` under `parent`: appends `child` to the parent's
    /// `children` and sets the child's `parent`.
    ///
    /// Any previous parent of `child` is not updated, and linking the same
    /// pair twice records the child twice.
    ///
    /// # Panics
    ///
    /// Panics if either index is out of bounds. Both indices are checked
    /// before anything is modified, so a failed call leaves the structure
    /// untouched.
    pub fn add_child(&mut self, parent: NodeIndex, child: NodeIndex) {
        let len = self.nodes.len();
        let parent_slot = parent.0 as usize;
        let child_slot = child.0 as usize;
        assert!(
            parent_slot < len,
            "parent index {} is out of bounds (len={})",
            parent.0,
            len
        );
        assert!(
            child_slot < len,
            "child index {} is out of bounds (len={})",
            child.0,
            len
        );
        self.nodes[parent_slot].children.push(child);
        self.nodes[child_slot].parent = Some(parent);
    }

    /// Checks that every parent/child link is in bounds and reciprocal.
    ///
    /// For each node: its `parent` (if any) must exist and list the node among
    /// its children, and every entry in `children` must exist and name the
    /// node as its parent. Cycles and duplicate child entries are not detected.
    ///
    /// # Errors
    ///
    /// Returns a message describing the first inconsistency found.
    pub fn validate(&self) -> std::result::Result<(), String> {
        // Compare in `usize` so that very large node lists are not
        // misjudged by a truncating cast to `u32`.
        let len = self.nodes.len();
        for (idx, node) in self.nodes.iter().enumerate() {
            if let Some(parent) = node.parent {
                let parent_node = match self.nodes.get(parent.0 as usize) {
                    Some(found) => found,
                    None => {
                        return Err(format!(
                            "Node {} has parent index {} which is out of bounds (len={})",
                            idx, parent.0, len
                        ));
                    }
                };
                let listed = parent_node
                    .children
                    .iter()
                    .any(|entry| entry.0 as usize == idx);
                if !listed {
                    return Err(format!(
                        "Node {} claims parent {}, but parent's children list does not contain {}",
                        idx, parent.0, idx
                    ));
                }
            }
            for child in &node.children {
                let child_node = match self.nodes.get(child.0 as usize) {
                    Some(found) => found,
                    None => {
                        return Err(format!(
                            "Node {} has child index {} which is out of bounds (len={})",
                            idx, child.0, len
                        ));
                    }
                };
                let points_back = match child_node.parent {
                    Some(owner) => owner.0 as usize == idx,
                    None => false,
                };
                if !points_back {
                    return Err(format!(
                        "Node {} lists child {}, but child's parent is {:?} instead of {}",
                        idx, child.0, child_node.parent, idx
                    ));
                }
            }
        }
        Ok(())
    }

    /// Iterates over parentless nodes in the [`ContentLayer::Body`] layer,
    /// paired with their indices, in storage order.
    pub fn body_roots(&self) -> impl Iterator<Item = (NodeIndex, &DocumentNode)> {
        self.nodes
            .iter()
            .enumerate()
            .filter(|(_, node)| node.parent.is_none() && node.content_layer == ContentLayer::Body)
            .map(|(i, node)| (NodeIndex(i as u32), node))
    }

    /// Iterates over parentless nodes in any layer other than
    /// [`ContentLayer::Body`], paired with their indices, in storage order.
    pub fn furniture_roots(&self) -> impl Iterator<Item = (NodeIndex, &DocumentNode)> {
        self.nodes
            .iter()
            .enumerate()
            .filter(|(_, node)| node.parent.is_none() && node.content_layer != ContentLayer::Body)
            .map(|(i, node)| (NodeIndex(i as u32), node))
    }

    /// Returns the node at `index`, or `None` if the index is out of bounds.
    pub fn get(&self, index: NodeIndex) -> Option<&DocumentNode> {
        self.nodes.get(index.0 as usize)
    }

    /// Number of nodes stored.
    pub fn len(&self) -> usize {
        self.nodes.len()
    }

    /// Returns `true` when no nodes are stored.
    pub fn is_empty(&self) -> bool {
        self.nodes.is_empty()
    }
}
impl Default for DocumentStructure {
fn default() -> Self {
Self::new()
}
}
// Unit tests for the document structure types: tree linking and validation,
// root selection, identifier generation, and serde behaviour.
#[cfg(test)]
mod tests {
use super::*;
// Builds a parentless, childless body paragraph whose id is derived from
// its type name, text, page and index.
fn make_paragraph(text: &str, page: Option<u32>, index: u32) -> DocumentNode {
let content = NodeContent::Paragraph { text: text.to_string() };
DocumentNode {
id: NodeId::generate(content.node_type_str(), text, page, index),
content,
parent: None,
children: vec![],
content_layer: ContentLayer::Body,
page,
page_end: None,
bbox: None,
annotations: vec![],
}
}
// An empty structure is valid and reports zero length.
#[test]
fn test_empty_document_validates() {
let doc = DocumentStructure::new();
assert!(doc.validate().is_ok());
assert!(doc.is_empty());
assert_eq!(doc.len(), 0);
}
// A single unlinked node is valid.
#[test]
fn test_single_node_validates() {
let mut doc = DocumentStructure::new();
doc.push_node(make_paragraph("Hello world", Some(1), 0));
assert!(doc.validate().is_ok());
assert_eq!(doc.len(), 1);
}
// add_child records the link on both the parent and the child.
#[test]
fn test_parent_child_relationship() {
let mut doc = DocumentStructure::new();
let group_content = NodeContent::Group {
label: None,
heading_level: Some(1),
heading_text: Some("Section 1".to_string()),
};
let group = DocumentNode {
id: NodeId::generate("group", "Section 1", Some(1), 0),
content: group_content,
parent: None,
children: vec![],
content_layer: ContentLayer::Body,
page: Some(1),
page_end: None,
bbox: None,
annotations: vec![],
};
let group_idx = doc.push_node(group);
let child = make_paragraph("Child paragraph", Some(1), 1);
let child_idx = doc.push_node(child);
doc.add_child(group_idx, child_idx);
assert!(doc.validate().is_ok());
assert_eq!(doc.nodes[0].children.len(), 1);
assert_eq!(doc.nodes[1].parent, Some(NodeIndex(0)));
}
// A parent index beyond the node list is rejected.
#[test]
fn test_validation_catches_bad_parent() {
let mut doc = DocumentStructure::new();
let mut node = make_paragraph("Bad parent", Some(1), 0);
node.parent = Some(NodeIndex(99)); doc.push_node(node);
assert!(doc.validate().is_err());
}
// A child that names a parent which does not list it back is rejected.
#[test]
fn test_validation_catches_inconsistent_parent_child() {
let mut doc = DocumentStructure::new();
let parent = DocumentNode {
id: NodeId::generate("group", "", Some(1), 0),
content: NodeContent::Group {
label: None,
heading_level: None,
heading_text: None,
},
parent: None,
children: vec![], content_layer: ContentLayer::Body,
page: Some(1),
page_end: None,
bbox: None,
annotations: vec![],
};
doc.push_node(parent);
let mut child = make_paragraph("Orphan child", Some(1), 1);
child.parent = Some(NodeIndex(0));
doc.push_node(child);
assert!(doc.validate().is_err());
}
// A child index beyond the node list is rejected.
#[test]
fn test_validation_catches_bad_child() {
let mut doc = DocumentStructure::new();
let parent = DocumentNode {
id: NodeId::generate("group", "", Some(1), 0),
content: NodeContent::Group {
label: None,
heading_level: None,
heading_text: None,
},
parent: None,
children: vec![NodeIndex(99)], content_layer: ContentLayer::Body,
page: Some(1),
page_end: None,
bbox: None,
annotations: vec![],
};
doc.push_node(parent);
assert!(doc.validate().is_err());
}
// Body roots and non-body ("furniture") roots are reported separately.
#[test]
fn test_body_and_furniture_roots() {
let mut doc = DocumentStructure::new();
doc.push_node(make_paragraph("Body content", Some(1), 0));
let mut header = make_paragraph("Page header", Some(1), 1);
header.content_layer = ContentLayer::Header;
doc.push_node(header);
let mut footer = make_paragraph("Page footer", Some(1), 2);
footer.content_layer = ContentLayer::Footer;
doc.push_node(footer);
assert!(doc.validate().is_ok());
let body: Vec<_> = doc.body_roots().collect();
assert_eq!(body.len(), 1);
let furniture: Vec<_> = doc.furniture_roots().collect();
assert_eq!(furniture.len(), 2);
}
// Identical inputs give identical ids; changing any one input changes the
// id, and a missing page differs from page 0.
#[test]
fn test_node_id_deterministic() {
let id1 = NodeId::generate("paragraph", "Hello world", Some(1), 0);
let id2 = NodeId::generate("paragraph", "Hello world", Some(1), 0);
assert_eq!(id1, id2);
let id3 = NodeId::generate("paragraph", "Different text", Some(1), 0);
assert_ne!(id1, id3);
let id4 = NodeId::generate("paragraph", "Hello world", Some(2), 0);
assert_ne!(id1, id4);
let id5 = NodeId::generate("heading", "Hello world", Some(1), 0);
assert_ne!(id1, id5);
let id6 = NodeId::generate("paragraph", "Hello world", Some(1), 1);
assert_ne!(id1, id6);
let id_none = NodeId::generate("paragraph", "Hello world", None, 0);
let id_some_0 = NodeId::generate("paragraph", "Hello world", Some(0), 0);
assert_ne!(id_none, id_some_0);
}
// text() returns the text of textual variants and None for structural ones.
#[test]
fn test_node_content_text() {
assert_eq!(
NodeContent::Paragraph {
text: "Hello".to_string()
}
.text(),
Some("Hello")
);
assert_eq!(
NodeContent::Title {
text: "Title".to_string()
}
.text(),
Some("Title")
);
assert_eq!(
NodeContent::Heading {
level: 1,
text: "H1".to_string()
}
.text(),
Some("H1")
);
assert_eq!(NodeContent::PageBreak.text(), None);
assert_eq!(NodeContent::Quote.text(), None);
assert_eq!(
NodeContent::Group {
label: None,
heading_level: None,
heading_text: None
}
.text(),
None
);
}
// A linked two-node document survives a JSON round trip and stays valid.
#[test]
fn test_serde_roundtrip() {
let mut doc = DocumentStructure::new();
let group_content = NodeContent::Group {
label: Some("section".to_string()),
heading_level: Some(1),
heading_text: Some("Introduction".to_string()),
};
let group = DocumentNode {
id: NodeId::generate("group", "Introduction", Some(1), 0),
content: group_content,
parent: None,
children: vec![],
content_layer: ContentLayer::Body,
page: Some(1),
page_end: None,
bbox: Some(BoundingBox {
x0: 10.0,
y0: 20.0,
x1: 500.0,
y1: 50.0,
}),
annotations: vec![],
};
let group_idx = doc.push_node(group);
let para_content = NodeContent::Paragraph {
text: "Hello world".to_string(),
};
let para = DocumentNode {
id: NodeId::generate("paragraph", "Hello world", Some(1), 1),
content: para_content,
parent: None,
children: vec![],
content_layer: ContentLayer::Body,
page: Some(1),
page_end: None,
bbox: None,
annotations: vec![TextAnnotation {
start: 0,
end: 5,
kind: AnnotationKind::Bold,
}],
};
let para_idx = doc.push_node(para);
doc.add_child(group_idx, para_idx);
assert!(doc.validate().is_ok());
let json = serde_json::to_string(&doc).expect("serialize");
let deserialized: DocumentStructure = serde_json::from_str(&json).expect("deserialize");
assert_eq!(deserialized.len(), 2);
assert!(deserialized.validate().is_ok());
assert_eq!(deserialized.nodes[0].children.len(), 1);
assert_eq!(deserialized.nodes[1].parent, Some(NodeIndex(0)));
}
// NodeContent serializes with a snake_case "node_type" tag next to its fields.
#[test]
fn test_serde_node_type_tag() {
let content = NodeContent::Heading {
level: 2,
text: "My Heading".to_string(),
};
let json = serde_json::to_value(&content).expect("serialize");
assert_eq!(json.get("node_type").unwrap(), "heading");
assert_eq!(json.get("level").unwrap(), 2);
assert_eq!(json.get("text").unwrap(), "My Heading");
}
// A Link annotation keeps its range, url and title across a JSON round trip.
#[test]
fn test_serde_annotation_roundtrip() {
let annotation = TextAnnotation {
start: 10,
end: 20,
kind: AnnotationKind::Link {
url: "https://example.com".to_string(),
title: Some("Example".to_string()),
},
};
let json = serde_json::to_string(&annotation).expect("serialize");
let deserialized: TextAnnotation = serde_json::from_str(&json).expect("deserialize");
assert_eq!(deserialized.start, 10);
assert_eq!(deserialized.end, 20);
match &deserialized.kind {
AnnotationKind::Link { url, title } => {
assert_eq!(url, "https://example.com");
assert_eq!(title.as_deref(), Some("Example"));
}
_ => panic!("Expected Link annotation"),
}
}
// TableGrid dimensions, cell count and header flags survive a JSON round trip.
#[test]
fn test_table_grid_serde() {
let grid = TableGrid {
rows: 2,
cols: 3,
cells: vec![
GridCell {
content: "Header 1".to_string(),
row: 0,
col: 0,
row_span: 1,
col_span: 1,
is_header: true,
bbox: None,
},
GridCell {
content: "Cell 1".to_string(),
row: 1,
col: 0,
row_span: 1,
col_span: 1,
is_header: false,
bbox: None,
},
],
};
let json = serde_json::to_string(&grid).expect("serialize");
let deserialized: TableGrid = serde_json::from_str(&json).expect("deserialize");
assert_eq!(deserialized.rows, 2);
assert_eq!(deserialized.cols, 3);
assert_eq!(deserialized.cells.len(), 2);
assert!(deserialized.cells[0].is_header);
assert!(!deserialized.cells[1].is_header);
}
// The default content layer is Body.
#[test]
fn test_content_layer_default() {
let layer: ContentLayer = Default::default();
assert_eq!(layer, ContentLayer::Body);
}
// The f32 tuple conversion maps (left, top, right, bottom) to x0, y0, x1, y1.
#[test]
fn test_bounding_box_from_f32_tuple() {
let bbox: BoundingBox = (10.5f32, 20.5f32, 100.5f32, 200.5f32).into();
assert!((bbox.x0 - 10.5).abs() < f64::EPSILON);
assert!((bbox.y0 - 20.5).abs() < f64::EPSILON);
assert!((bbox.x1 - 100.5).abs() < f64::EPSILON);
assert!((bbox.y1 - 200.5).abs() < f64::EPSILON);
}
// Empty and None fields are left out of the serialized node; id, content
// and a present page are kept.
#[test]
fn test_skip_serializing_empty_fields() {
let node = make_paragraph("Simple", Some(1), 0);
let json = serde_json::to_value(&node).expect("serialize");
assert!(json.get("parent").is_none());
assert!(json.get("children").is_none());
assert!(json.get("page_end").is_none());
assert!(json.get("bbox").is_none());
assert!(json.get("annotations").is_none());
assert!(json.get("id").is_some());
assert!(json.get("content").is_some());
assert!(json.get("page").is_some());
}
}