use crate::types::document_structure::GridCell;
use crate::types::{
BoundingBox, ContentLayer, DocumentNode, DocumentStructure, ExtractionResult, NodeContent, NodeId, NodeIndex,
TableGrid,
};
use super::elements::detect_list_items;
use super::types::ListType;
pub fn transform_to_document_structure(result: &ExtractionResult) -> DocumentStructure {
let mut doc = DocumentStructure::with_capacity(estimate_node_count(result));
let mut section_stack: Vec<(u8, NodeIndex)> = Vec::new();
if let Some(ref pages) = result.pages {
for page in pages {
let page_num = page.page_number as u32;
section_stack.clear();
if let Some(ref hierarchy) = page.hierarchy {
for block in &hierarchy.blocks {
let level = parse_heading_level(&block.level);
let bbox = block.bbox.map(BoundingBox::from);
if let Some(level) = level {
push_heading_group(&mut doc, &mut section_stack, level, &block.text, Some(page_num), bbox);
} else if !block.text.trim().is_empty() {
push_content_node(
&mut doc,
§ion_stack,
NodeContent::Paragraph {
text: block.text.clone(),
},
Some(page_num),
bbox,
);
}
}
}
for table_arc in &page.tables {
let table = table_arc.as_ref();
let grid = table_cells_to_grid(&table.cells);
push_content_node(
&mut doc,
§ion_stack,
NodeContent::Table { grid },
Some(page_num),
None,
);
}
for (idx, image_arc) in page.images.iter().enumerate() {
let image = image_arc.as_ref();
push_content_node(
&mut doc,
§ion_stack,
NodeContent::Image {
description: image.description.clone(),
image_index: Some(idx as u32),
src: None,
},
Some(page_num),
None,
);
}
let has_hierarchy_blocks = page.hierarchy.as_ref().is_some_and(|h| !h.blocks.is_empty());
if !has_hierarchy_blocks {
process_text_content(&mut doc, §ion_stack, &page.content, Some(page_num));
}
if result.pages.as_ref().is_some_and(|all| page.page_number < all.len()) {
push_content_node(&mut doc, §ion_stack, NodeContent::PageBreak, Some(page_num), None);
}
}
} else {
process_text_content(&mut doc, §ion_stack, &result.content, Some(1));
for table in &result.tables {
let grid = table_cells_to_grid(&table.cells);
push_content_node(
&mut doc,
§ion_stack,
NodeContent::Table { grid },
Some(table.page_number as u32),
None,
);
}
if let Some(ref images) = result.images {
for (idx, image) in images.iter().enumerate() {
let page_num = image.page_number.map(|p| p as u32).unwrap_or(1);
push_content_node(
&mut doc,
§ion_stack,
NodeContent::Image {
description: image.description.clone(),
image_index: Some(idx as u32),
src: None,
},
Some(page_num),
None,
);
}
}
}
debug_assert!(
doc.validate().is_ok(),
"DocumentStructure validation failed: {:?}",
doc.validate()
);
doc
}
/// Opens a new heading section and records it on the section stack.
///
/// Every open section whose level is greater than or equal to `level` is popped
/// first, so the new group becomes a child of the nearest shallower section (or a
/// root when none remains). Two nodes are appended to `doc`, in this order:
///
/// 1. a `Group` node carrying the heading level and text, and
/// 2. a `Heading` node, stored as the group's first child.
///
/// Both nodes share `page` and `bbox`. Finally `(level, group index)` is pushed onto
/// `section_stack` so later content attaches to the new group.
fn push_heading_group(
    doc: &mut DocumentStructure,
    section_stack: &mut Vec<(u8, NodeIndex)>,
    level: u8,
    text: &str,
    page: Option<u32>,
    bbox: Option<BoundingBox>,
) {
    // Close sections that are siblings of, or nested deeper than, the new heading.
    while let Some(&(open_level, _)) = section_stack.last() {
        if open_level < level {
            break;
        }
        section_stack.pop();
    }
    let enclosing = section_stack.last().map(|&(_, idx)| idx);

    let group_id = NodeId::generate("group", text, page, doc.len() as u32);
    let group_idx = doc.push_node(DocumentNode {
        id: group_id,
        content: NodeContent::Group {
            label: None,
            heading_level: Some(level),
            heading_text: Some(text.to_owned()),
        },
        parent: None,
        children: Vec::new(),
        content_layer: ContentLayer::Body,
        page,
        page_end: None,
        bbox,
        annotations: Vec::new(),
        attributes: None,
    });
    if let Some(parent_idx) = enclosing {
        doc.add_child(parent_idx, group_idx);
    }

    // The heading node is wired to its group by hand: the parent is set up front and
    // the child link is pushed directly onto the group's child list.
    let heading_id = NodeId::generate("heading", text, page, doc.len() as u32);
    let heading_idx = doc.push_node(DocumentNode {
        id: heading_id,
        content: NodeContent::Heading {
            level,
            text: text.to_owned(),
        },
        parent: Some(group_idx),
        children: Vec::new(),
        content_layer: ContentLayer::Body,
        page,
        page_end: None,
        bbox,
        annotations: Vec::new(),
        attributes: None,
    });
    doc.nodes[group_idx.0 as usize].children.push(heading_idx);

    section_stack.push((level, group_idx));
}
/// Appends a leaf content node to `doc` and returns its index.
///
/// The node is attached as a child of the innermost open section (the last entry of
/// `section_stack`) when there is one. `PageBreak` nodes are the exception: they are
/// never attached to a section and always stay parentless.
///
/// The node id is generated from the content's type string, its text (empty when the
/// content has none), the page, and the node's position in the document.
fn push_content_node(
    doc: &mut DocumentStructure,
    section_stack: &[(u8, NodeIndex)],
    content: NodeContent,
    page: Option<u32>,
    bbox: Option<BoundingBox>,
) -> NodeIndex {
    // Everything derived from `content` is computed before it is moved into the node.
    let attach_to_section = !matches!(content, NodeContent::PageBreak);
    let position = doc.len() as u32;
    let id = NodeId::generate(content.node_type_str(), content.text().unwrap_or(""), page, position);

    let node_idx = doc.push_node(DocumentNode {
        id,
        content,
        parent: None,
        children: Vec::new(),
        content_layer: ContentLayer::Body,
        page,
        page_end: None,
        bbox,
        annotations: Vec::new(),
        attributes: None,
    });

    if attach_to_section {
        if let Some(&(_, parent_idx)) = section_stack.last() {
            doc.add_child(parent_idx, node_idx);
        }
    }
    node_idx
}
/// Converts a block of plain text into paragraph, heading and list nodes.
///
/// Blank input produces nothing. When `detect_list_items` finds no list items the
/// whole text is handed to `add_paragraphs`. Otherwise the text is walked in reading
/// order: prose found between list items becomes paragraphs, and runs of consecutive
/// list items of the same `ListType` become one `List` node with `ListItem` children.
///
/// A list run ends when the list type changes or when non-blank prose appears between
/// two items. Each run is emitted at the point where it ends, so the nodes appear in
/// `doc` in the same order as the text they came from.
///
/// Item spans are taken from each detected item's `byte_start..byte_end` range and
/// are used to slice `content` directly.
fn process_text_content(
    doc: &mut DocumentStructure,
    section_stack: &[(u8, NodeIndex)],
    content: &str,
    page: Option<u32>,
) {
    if content.trim().is_empty() {
        return;
    }
    let list_items = detect_list_items(content);
    if list_items.is_empty() {
        add_paragraphs(doc, section_stack, content, page);
        return;
    }

    let mut current_offset = 0;
    // The list run currently being collected: its type and the byte spans of its items.
    let mut pending: Option<(ListType, Vec<(usize, usize)>)> = None;

    for item in &list_items {
        if current_offset < item.byte_start {
            let text_before = &content[current_offset..item.byte_start];
            // Whitespace-only gaps (e.g. the newline between two items) do not
            // interrupt a list; real prose does.
            if !text_before.trim().is_empty() {
                if let Some((kind, spans)) = pending.take() {
                    push_list_group(doc, section_stack, content, kind, &spans, page);
                }
                add_paragraphs(doc, section_stack, text_before, page);
            }
        }

        let continues_run = pending.as_ref().is_some_and(|(kind, _)| *kind == item.list_type);
        if !continues_run {
            if let Some((kind, spans)) = pending.take() {
                push_list_group(doc, section_stack, content, kind, &spans, page);
            }
            pending = Some((item.list_type, Vec::new()));
        }
        if let Some((_, spans)) = pending.as_mut() {
            spans.push((item.byte_start, item.byte_end));
        }

        current_offset = item.byte_end;
    }

    if let Some((kind, spans)) = pending.take() {
        push_list_group(doc, section_stack, content, kind, &spans, page);
    }

    if current_offset < content.len() {
        let text_after = &content[current_offset..];
        add_paragraphs(doc, section_stack, text_after, page);
    }
}

/// Appends one `List` node plus a `ListItem` child for every span in `spans`.
///
/// The list is attached to the innermost open section, if any. Each span is sliced
/// out of `content`, trimmed, and stripped of its leading list marker.
fn push_list_group(
    doc: &mut DocumentStructure,
    section_stack: &[(u8, NodeIndex)],
    content: &str,
    list_type: ListType,
    spans: &[(usize, usize)],
    page: Option<u32>,
) {
    let ordered = matches!(list_type, ListType::Numbered | ListType::Lettered);
    let list_index = doc.len() as u32;
    let list_node = DocumentNode {
        id: NodeId::generate("list", &format!("{:?}_{}", list_type, spans.len()), page, list_index),
        content: NodeContent::List { ordered },
        parent: None,
        children: vec![],
        content_layer: ContentLayer::Body,
        page,
        page_end: None,
        bbox: None,
        annotations: vec![],
        attributes: None,
    };
    let list_idx = doc.push_node(list_node);
    if let Some((_, parent_idx)) = section_stack.last() {
        doc.add_child(*parent_idx, list_idx);
    }

    for &(start, end) in spans {
        let item_text = content[start..end].trim();
        let clean_text = strip_list_marker(item_text);
        let item_index = doc.len() as u32;
        let item_node = DocumentNode {
            id: NodeId::generate("list_item", clean_text, page, item_index),
            content: NodeContent::ListItem {
                text: clean_text.to_string(),
            },
            parent: Some(list_idx),
            children: vec![],
            content_layer: ContentLayer::Body,
            page,
            page_end: None,
            bbox: None,
            annotations: vec![],
            attributes: None,
        };
        let item_idx = doc.push_node(item_node);
        doc.nodes[list_idx.0 as usize].children.push(item_idx);
    }
}
/// Splits `text` on blank lines and appends one node per non-empty chunk.
///
/// A chunk that parses as a markdown ATX heading opens a heading group; anything else
/// becomes a `Paragraph` holding the trimmed chunk. Headings found here are tracked on
/// a private copy of `section_stack`, so sections opened by this call only affect the
/// chunks that follow within the same call and are not visible to the caller.
fn add_paragraphs(doc: &mut DocumentStructure, section_stack: &[(u8, NodeIndex)], text: &str, page: Option<u32>) {
    let mut open_sections = section_stack.to_vec();
    let chunks = text.split("\n\n").map(str::trim).filter(|chunk| !chunk.is_empty());

    for chunk in chunks {
        match parse_markdown_heading(chunk) {
            Some((level, title)) => {
                push_heading_group(doc, &mut open_sections, level, title, page, None);
            }
            None => {
                push_content_node(
                    doc,
                    &open_sections,
                    NodeContent::Paragraph {
                        text: chunk.to_owned(),
                    },
                    page,
                    None,
                );
            }
        }
    }
}
/// Parses a markdown ATX heading such as `"## Title"`.
///
/// Returns `(level, title)` where `level` is the number of leading `#` characters
/// (1 through 6) and `title` is the trimmed remainder. Returns `None` when, after
/// trimming surrounding whitespace, the input:
///
/// * does not start with `#`, or starts with more than six of them,
/// * has anything other than a space directly after the `#` run, or
/// * has no title text.
fn parse_markdown_heading(line: &str) -> Option<(u8, &str)> {
    let candidate = line.trim();
    let level = candidate.bytes().take_while(|&b| b == b'#').count();
    if !(1..=6).contains(&level) {
        return None;
    }
    // `#` is a single byte, so `level` is also the byte offset of the remainder.
    // A bare `#` run (empty remainder) fails the prefix check just like a missing space.
    let title = candidate[level..].strip_prefix(' ')?.trim();
    if title.is_empty() {
        None
    } else {
        Some((level as u8, title))
    }
}
fn table_cells_to_grid(cells: &[Vec<String>]) -> TableGrid {
let rows = cells.len() as u32;
let cols = cells.iter().map(|r| r.len()).max().unwrap_or(0) as u32;
let mut grid_cells = Vec::new();
for (row_idx, row) in cells.iter().enumerate() {
for (col_idx, cell_content) in row.iter().enumerate() {
grid_cells.push(GridCell {
content: cell_content.clone(),
row: row_idx as u32,
col: col_idx as u32,
row_span: 1,
col_span: 1,
is_header: row_idx == 0, bbox: None,
});
}
}
TableGrid {
rows,
cols,
cells: grid_cells,
}
}
/// Maps a hierarchy level tag (`"h1"` through `"h6"`) to its numeric heading level.
///
/// The match is exact and case-sensitive; any other string yields `None`.
fn parse_heading_level(level: &str) -> Option<u8> {
    match level.as_bytes() {
        // Exactly two bytes: a lowercase `h` followed by one digit in 1..=6.
        [b'h', digit @ b'1'..=b'6'] => Some(digit - b'0'),
        _ => None,
    }
}
/// Removes a leading list marker from `text` and returns the remaining slice.
///
/// Leading whitespace is always dropped. Recognised markers are:
///
/// * a bullet (`-`, `*` or `•`) followed by a space,
/// * one or two ASCII digits followed by `". "`, and
/// * a single one-byte alphabetic character followed by `". "`.
///
/// When no marker matches, the left-trimmed input is returned unchanged.
fn strip_list_marker(text: &str) -> &str {
    let body = text.trim_start();

    let bullets = ["- ", "* ", "• "];
    if let Some(rest) = bullets.iter().find_map(|marker| body.strip_prefix(*marker)) {
        return rest;
    }

    // Ordered markers: a short label before the first '.', then a single space.
    let Some((label, tail)) = body.split_once('.') else {
        return body;
    };
    let numbered = (1..=2).contains(&label.len()) && label.bytes().all(|b| b.is_ascii_digit());
    let lettered = label.len() == 1 && label.chars().all(char::is_alphabetic);
    if numbered || lettered {
        if let Some(rest) = tail.strip_prefix(' ') {
            return rest;
        }
    }
    body
}
fn estimate_node_count(result: &ExtractionResult) -> usize {
let base = if let Some(ref pages) = result.pages {
pages
.iter()
.map(|p| {
let hierarchy_count = p.hierarchy.as_ref().map(|h| h.blocks.len()).unwrap_or(0);
let table_count = p.tables.len();
let image_count = p.images.len();
hierarchy_count + table_count + image_count + 5
})
.sum()
} else {
(result.content.len() / 200).max(4)
};
base + result.tables.len()
}
// Unit tests for the extraction-result to document-structure transform.
#[cfg(test)]
mod tests {
use super::*;
use crate::types::{ExtractionResult, HierarchicalBlock, Metadata, PageContent, PageHierarchy, Table};
use std::borrow::Cow;
// Builds a `Metadata` value with every optional field unset.
fn test_metadata() -> Metadata {
Metadata {
title: None,
subject: None,
authors: None,
keywords: None,
language: None,
created_at: None,
modified_at: None,
created_by: None,
modified_by: None,
pages: None,
format: None,
image_preprocessing: None,
json_schema: None,
error: None,
extraction_duration_ms: None,
category: None,
tags: None,
document_version: None,
abstract_text: None,
output_format: None,
additional: Default::default(),
}
}
// Builds an unpaged plain-text `ExtractionResult` holding only `content`.
// Tests that need pages or tables override those fields with struct-update syntax.
fn test_result(content: &str) -> ExtractionResult {
ExtractionResult {
content: content.to_string(),
mime_type: Cow::Borrowed("text/plain"),
metadata: test_metadata(),
tables: vec![],
detected_languages: None,
chunks: None,
images: None,
djot_content: None,
pages: None,
elements: None,
ocr_elements: None,
document: None,
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
extracted_keywords: None,
quality_score: None,
processing_warnings: Vec::new(),
annotations: None,
children: None,
uris: None,
#[cfg(feature = "tree-sitter")]
code_intelligence: None,
formatted_content: None,
ocr_internal_document: None,
}
}
// Blank-line separated text yields one root paragraph node per chunk.
#[test]
fn test_simple_paragraphs() {
let result = test_result("First paragraph.\n\nSecond paragraph.\n\nThird paragraph.");
let doc = transform_to_document_structure(&result);
assert!(doc.validate().is_ok());
assert_eq!(doc.len(), 3);
let body: Vec<_> = doc.body_roots().collect();
assert_eq!(body.len(), 3);
}
// Three bullet lines yield one unordered list root with three item children
// (four nodes in total).
#[test]
fn test_list_detection() {
let result = test_result("- First item\n- Second item\n- Third item");
let doc = transform_to_document_structure(&result);
assert!(doc.validate().is_ok());
assert_eq!(doc.len(), 4);
let roots: Vec<_> = doc.body_roots().collect();
assert_eq!(roots.len(), 1);
match &roots[0].1.content {
NodeContent::List { ordered } => assert!(!ordered),
_ => panic!("Expected List node"),
}
assert_eq!(doc.nodes[0].children.len(), 3);
}
// An h1 block followed by an h2 and a body block yields a single root group for the
// h1. The page's plain `content` is not parsed here because the page has hierarchy
// blocks.
#[test]
fn test_heading_driven_sections() {
let result = ExtractionResult {
pages: Some(vec![PageContent {
page_number: 1,
content: "Body text under heading.".to_string(),
tables: vec![],
images: vec![],
hierarchy: Some(PageHierarchy {
block_count: 3,
blocks: vec![
HierarchicalBlock {
text: "Main Title".to_string(),
font_size: 24.0,
level: "h1".to_string(),
bbox: Some((10.0, 20.0, 500.0, 50.0)),
},
HierarchicalBlock {
text: "Subtitle".to_string(),
font_size: 18.0,
level: "h2".to_string(),
bbox: None,
},
HierarchicalBlock {
text: "Body text from hierarchy.".to_string(),
font_size: 12.0,
level: "body".to_string(),
bbox: None,
},
],
}),
is_blank: None,
}]),
..test_result("")
};
let doc = transform_to_document_structure(&result);
assert!(doc.validate().is_ok());
let roots: Vec<_> = doc.body_roots().collect();
assert_eq!(roots.len(), 1);
match &roots[0].1.content {
NodeContent::Group {
heading_level,
heading_text,
..
} => {
assert_eq!(*heading_level, Some(1));
assert_eq!(heading_text.as_deref(), Some("Main Title"));
}
_ => panic!("Expected Group node"),
}
let h1_children = &doc.nodes[0].children;
assert!(!h1_children.is_empty());
}
// Two consecutive h1 blocks become two sibling root groups: the second h1 closes
// the first section instead of nesting inside it.
#[test]
fn test_multiple_h1_sections() {
let result = ExtractionResult {
pages: Some(vec![PageContent {
page_number: 1,
content: String::new(),
tables: vec![],
images: vec![],
hierarchy: Some(PageHierarchy {
block_count: 2,
blocks: vec![
HierarchicalBlock {
text: "First Section".to_string(),
font_size: 24.0,
level: "h1".to_string(),
bbox: None,
},
HierarchicalBlock {
text: "Second Section".to_string(),
font_size: 24.0,
level: "h1".to_string(),
bbox: None,
},
],
}),
is_blank: None,
}]),
..test_result("")
};
let doc = transform_to_document_structure(&result);
assert!(doc.validate().is_ok());
let roots: Vec<_> = doc.body_roots().collect();
assert_eq!(roots.len(), 2);
}
// An h3 that directly follows an h1 (no h2 in between) nests under the h1 group.
// The h1 group's children are its own heading node followed by the h3 group.
#[test]
fn test_skipped_heading_levels() {
let result = ExtractionResult {
pages: Some(vec![PageContent {
page_number: 1,
content: String::new(),
tables: vec![],
images: vec![],
hierarchy: Some(PageHierarchy {
block_count: 2,
blocks: vec![
HierarchicalBlock {
text: "Title".to_string(),
font_size: 24.0,
level: "h1".to_string(),
bbox: None,
},
HierarchicalBlock {
text: "Subsub".to_string(),
font_size: 14.0,
level: "h3".to_string(),
bbox: None,
},
],
}),
is_blank: None,
}]),
..test_result("")
};
let doc = transform_to_document_structure(&result);
assert!(doc.validate().is_ok());
assert_eq!(doc.nodes[0].children.len(), 2);
let heading_idx = doc.nodes[0].children[0];
assert!(matches!(
doc.nodes[heading_idx.0 as usize].content,
NodeContent::Heading { level: 1, .. }
));
let h3_idx = doc.nodes[0].children[1];
assert_eq!(doc.nodes[h3_idx.0 as usize].parent, Some(NodeIndex(0)));
}
// Without any headings every paragraph stays a parentless root node.
#[test]
fn test_no_headings_flat_paragraphs() {
let result = test_result("Paragraph one.\n\nParagraph two.");
let doc = transform_to_document_structure(&result);
assert!(doc.validate().is_ok());
assert_eq!(doc.len(), 2);
for node in &doc.nodes {
assert!(node.parent.is_none());
}
}
// A 2x2 top-level table becomes a Table node with a 2x2 grid of four cells; cells
// in the first row are headers, cells in the second row are not.
#[test]
fn test_table_grid_conversion() {
let result = ExtractionResult {
tables: vec![Table {
cells: vec![
vec!["Name".to_string(), "Age".to_string()],
vec!["Alice".to_string(), "30".to_string()],
],
markdown: "| Name | Age |\n|---|---|\n| Alice | 30 |".to_string(),
page_number: 1,
bounding_box: None,
}],
..test_result("Some content")
};
let doc = transform_to_document_structure(&result);
assert!(doc.validate().is_ok());
let table_node = doc
.nodes
.iter()
.find(|n| matches!(n.content, NodeContent::Table { .. }));
assert!(table_node.is_some());
if let NodeContent::Table { ref grid } = table_node.unwrap().content {
assert_eq!(grid.rows, 2);
assert_eq!(grid.cols, 2);
assert_eq!(grid.cells.len(), 4);
assert!(grid.cells[0].is_header); assert!(!grid.cells[2].is_header);
}
}
// A structure containing a paragraph and a list survives a JSON round trip with the
// same node count and still validates.
#[test]
fn test_serde_roundtrip() {
let result = test_result("Hello world.\n\n- Item 1\n- Item 2");
let doc = transform_to_document_structure(&result);
let json = serde_json::to_string(&doc).expect("serialize");
let deserialized: DocumentStructure = serde_json::from_str(&json).expect("deserialize");
assert_eq!(deserialized.len(), doc.len());
assert!(deserialized.validate().is_ok());
}
// Bullet, numbered and lettered markers are stripped; text without a marker is
// returned unchanged.
#[test]
fn test_strip_list_marker() {
assert_eq!(strip_list_marker("- item"), "item");
assert_eq!(strip_list_marker("* item"), "item");
assert_eq!(strip_list_marker("• item"), "item");
assert_eq!(strip_list_marker("1. item"), "item");
assert_eq!(strip_list_marker("a. item"), "item");
assert_eq!(strip_list_marker("plain text"), "plain text");
}
// Empty content produces an empty, valid document.
#[test]
fn test_empty_content() {
let result = test_result("");
let doc = transform_to_document_structure(&result);
assert!(doc.validate().is_ok());
assert!(doc.is_empty());
}
}