use std::collections::HashMap;
use super::OxideDocument;
use crate::pdf::error::Result;
use crate::pdf::hierarchy::SegmentData;
/// Extracts the text segments of a single page without structure-tree role information.
///
/// Forwards to `extract_segments_from_page_inner` with an empty MCID-to-role map, so no
/// role lookup can succeed for any span on the page.
pub(crate) fn extract_segments_from_page(doc: &mut OxideDocument, page_index: usize) -> Result<Vec<SegmentData>> {
    let no_roles: HashMap<u32, Option<u8>> = HashMap::new();
    extract_segments_from_page_inner(doc, page_index, &no_roles)
}
/// Converts the text spans of one page into `SegmentData` records.
///
/// * `doc` - document wrapper whose inner `doc` handle is queried for the media box and spans.
/// * `page_index` - index of the page to read.
/// * `mcid_roles` - per-page lookup from a span's MCID to an optional role value; spans whose
///   MCID is absent from the map (or that carry no MCID) get `assigned_role == None`.
///
/// Spans that carry an artifact type, or whose text is empty after trimming, are skipped.
/// If span extraction fails, the failure is logged at debug level and an empty list is
/// returned rather than an error.
fn extract_segments_from_page_inner(
    doc: &mut OxideDocument,
    page_index: usize,
    mcid_roles: &HashMap<u32, Option<u8>>,
) -> Result<Vec<SegmentData>> {
    // Height of the page taken from its media box; 792.0 is the fallback when the
    // media box cannot be read.
    let page_height = match doc.doc.get_page_media_box(page_index) {
        Ok((_, lly, _, ury)) => (ury - lly).abs(),
        Err(_) => 792.0,
    };

    let raw_spans = match doc.doc.extract_spans(page_index) {
        Err(e) => {
            tracing::debug!(page = page_index, "pdf_oxide extract_spans failed for hierarchy: {e}");
            return Ok(Vec::new());
        }
        Ok(found) => found,
    };

    let mut segments: Vec<SegmentData> = Vec::new();
    for span in raw_spans {
        // Drop artifacts and whitespace-only spans.
        if span.artifact_type.is_some() || span.text.trim().is_empty() {
            continue;
        }

        let is_bold = span.font_weight == pdf_oxide::layout::text_block::FontWeight::Bold;

        // NOTE(review): the bbox appears to use a top-left origin that is flipped here
        // against the page height — confirm against pdf_oxide's span coordinates.
        let bbox = &span.bbox;
        let screen_bottom = bbox.y + bbox.height;
        let baseline_y = page_height - screen_bottom;
        let y = page_height - bbox.y - bbox.height;

        // Look up the role recorded for this span's MCID, if it has one.
        let assigned_role = match span.mcid {
            Some(mcid) => mcid_roles.get(&mcid).copied().flatten(),
            None => None,
        };

        segments.push(SegmentData {
            text: span.text,
            x: bbox.x,
            y,
            width: bbox.width,
            height: bbox.height,
            font_size: span.font_size,
            is_bold,
            is_italic: span.is_italic,
            is_monospace: span.is_monospace,
            baseline_y,
            assigned_role,
        });
    }

    Ok(segments)
}
/// Extracts segments for every page, tagging them with heading levels from the structure tree.
///
/// Returns `(pages, true)` when the structure tree was used, where `pages` holds one segment
/// list per page. Returns `(Vec::new(), false)` when the tree cannot be used, which happens if:
///
/// * reading the mark info fails, or it reports the structure as not reliable;
/// * the structure tree is missing or fails to load;
/// * the tree contains fewer than 3 elements that report a heading level.
///
/// # Errors
///
/// Returns `PdfError::TextExtractionFailed` if the page count cannot be read, and propagates
/// any error from per-page segment extraction.
fn extract_segments_with_structure_tree(doc: &mut OxideDocument) -> Result<(Vec<Vec<SegmentData>>, bool)> {
    let mark_info = match doc.doc.mark_info() {
        Err(e) => {
            tracing::debug!("pdf_oxide: mark_info() failed, skipping structure tree: {e}");
            return Ok((Vec::new(), false));
        }
        Ok(info) => info,
    };

    let reliable = mark_info.is_structure_reliable();
    if !reliable {
        tracing::debug!(
            marked = mark_info.marked,
            suspects = mark_info.suspects,
            "pdf_oxide: structure tree not reliable, falling back to font-size clustering"
        );
        return Ok((Vec::new(), false));
    }

    let struct_tree = match doc.doc.structure_tree() {
        Err(e) => {
            tracing::debug!("pdf_oxide: structure_tree() failed: {e}");
            return Ok((Vec::new(), false));
        }
        Ok(None) => {
            tracing::debug!("pdf_oxide: no structure tree found despite marked=true");
            return Ok((Vec::new(), false));
        }
        Ok(Some(tree)) => tree,
    };

    let all_page_content = pdf_oxide::structure::traverse_structure_tree_all_pages(&struct_tree);

    // Total number of heading-typed elements across all pages.
    let heading_count: usize = all_page_content
        .values()
        .map(|contents| {
            contents
                .iter()
                .filter(|c| c.parsed_type.heading_level().is_some())
                .count()
        })
        .sum();
    if heading_count < 3 {
        tracing::debug!(
            heading_count,
            "pdf_oxide: structure tree has too few heading elements (< 3), falling back to font-size clustering"
        );
        return Ok((Vec::new(), false));
    }

    let page_count = doc.doc.page_count().map_err(|e| {
        crate::pdf::error::PdfError::TextExtractionFailed(format!("pdf_oxide: failed to get page count: {e}"))
    })?;

    let mut all_pages: Vec<Vec<SegmentData>> = Vec::with_capacity(page_count);
    let mut total_role_assigned = 0usize;
    for page_idx in 0..page_count {
        // Build the MCID -> heading-level lookup for this page; pages without
        // structure content get an empty map.
        let mut mcid_roles: HashMap<u32, Option<u8>> = HashMap::new();
        if let Some(contents) = all_page_content.get(&(page_idx as u32)) {
            for item in contents.iter() {
                if let Some(mcid) = item.mcid {
                    mcid_roles.insert(mcid, item.parsed_type.heading_level());
                }
            }
        }

        let segments = extract_segments_from_page_inner(doc, page_idx, &mcid_roles)?;
        let assigned_here = segments.iter().filter(|s| s.assigned_role.is_some()).count();
        total_role_assigned += assigned_here;
        all_pages.push(segments);
    }

    tracing::debug!(
        page_count,
        total_role_assigned,
        "pdf_oxide: structure tree heading detection complete"
    );
    Ok((all_pages, true))
}
/// Extracts segments for every page of the document.
///
/// Structure-tree based extraction is tried first. Its result is returned with `true` when it
/// reports that the tree was used and produced at least one page. Otherwise every page is
/// extracted without role information and the result is returned with `false`.
///
/// # Errors
///
/// Propagates errors from the structure-tree attempt and from per-page extraction, and
/// returns `PdfError::TextExtractionFailed` if the page count cannot be read.
pub(crate) fn extract_all_segments(doc: &mut OxideDocument) -> Result<(Vec<Vec<SegmentData>>, bool)> {
    let (tree_segments, used_tree) = extract_segments_with_structure_tree(doc)?;
    let tree_is_usable = used_tree && !tree_segments.is_empty();
    if tree_is_usable {
        return Ok((tree_segments, true));
    }

    let page_count = doc.doc.page_count().map_err(|e| {
        crate::pdf::error::PdfError::TextExtractionFailed(format!("pdf_oxide: failed to get page count: {e}"))
    })?;

    // Extract page by page, stopping at the first page that fails.
    let all_pages = (0..page_count)
        .map(|page_idx| extract_segments_from_page(doc, page_idx))
        .collect::<Result<Vec<Vec<SegmentData>>>>()?;

    Ok((all_pages, false))
}