use crate::document::PdfDocument;
use crate::elements::{ContentElement, StructureElement};
use crate::error::Result;
use crate::geometry::Rect;
use std::collections::HashMap;
/// Namespace type for page-level structure extraction.
///
/// This is a unit struct: it holds no data, and the functions in its `impl`
/// block are associated functions that take the document as an explicit
/// argument rather than `self`.
pub struct HierarchicalExtractor;
impl HierarchicalExtractor {
pub fn extract_page(
document: &mut PdfDocument,
page_index: usize,
) -> Result<Option<StructureElement>> {
let page_count = document.page_count()?;
if page_index >= page_count {
return Err(crate::error::Error::InvalidPdf(format!(
"Page index {} out of range (document has {} pages)",
page_index, page_count
)));
}
let _has_structure_tree = document.structure_tree()?.is_some();
Self::generate_synthetic_structure(document, page_index)
}
pub fn generate_synthetic_structure(
document: &mut PdfDocument,
page_index: usize,
) -> Result<Option<StructureElement>> {
use crate::elements::{ContentElement, TextContent};
let text_spans = match document.extract_spans(page_index) {
Ok(spans) => spans,
Err(crate::error::Error::ParseError { reason, .. })
if reason.contains("no Contents") =>
{
Vec::new()
},
Err(e) => return Err(e),
};
let children: Vec<ContentElement> = text_spans
.into_iter()
.map(|span| ContentElement::Text(TextContent::from(span)))
.collect();
let bbox = if children.is_empty() {
Rect::new(0.0, 0.0, 595.0, 842.0)
} else {
let mut min_x = f32::MAX;
let mut min_y = f32::MAX;
let mut max_x = f32::MIN;
let mut max_y = f32::MIN;
for child in &children {
let child_bbox = child.bbox();
min_x = min_x.min(child_bbox.x);
min_y = min_y.min(child_bbox.y);
max_x = max_x.max(child_bbox.x + child_bbox.width);
max_y = max_y.max(child_bbox.y + child_bbox.height);
}
Rect::new(min_x, min_y, max_x - min_x, max_y - min_y)
};
Ok(Some(StructureElement {
structure_type: "Document".to_string(),
bbox,
children,
reading_order: Some(0),
alt_text: None,
language: None,
}))
}
pub fn extract_content_with_mcids(
_document: &mut PdfDocument,
_page_index: usize,
) -> Result<HashMap<u32, Vec<ContentElement>>> {
let _mcid_map: HashMap<u32, Vec<ContentElement>> = HashMap::new();
Ok(HashMap::new())
}
}
#[cfg(test)]
mod tests {
    use super::HierarchicalExtractor;

    /// Smoke test: the unit struct can be named and constructed as a value.
    #[test]
    fn test_hierarchical_extractor_creation() {
        let _: HierarchicalExtractor = HierarchicalExtractor;
    }
}