use std::cell::RefCell;
use std::path::PathBuf;
#[cfg(feature = "pdf")]
use crate::pdf::hierarchy::SegmentData;
thread_local! {
/// Per-thread slot for an optional PDF file path; initialised to `None`.
///
/// Written by `set_current_pdf_path` and read by `current_pdf_path`. Because
/// this is a thread-local, a value stored on one thread is not visible from
/// any other thread.
static CURRENT_PDF_PATH: RefCell<Option<PathBuf>> = const { RefCell::new(None) };
}
/// Stores `path` in this thread's `CURRENT_PDF_PATH` slot, discarding whatever
/// was stored before.
///
/// Passing `None` clears the slot.
pub(crate) fn set_current_pdf_path(path: Option<PathBuf>) {
    CURRENT_PDF_PATH.with(|slot| {
        // `replace` hands back the previous value; it is intentionally dropped.
        drop(slot.replace(path));
    });
}
/// Returns an owned copy of the path stored in this thread's
/// `CURRENT_PDF_PATH` slot, or `None` if the slot is empty.
pub(crate) fn current_pdf_path() -> Option<PathBuf> {
    CURRENT_PDF_PATH.with(|slot| {
        let stored = slot.borrow();
        // Clone the inner `Option<PathBuf>` so the borrow ends with this closure.
        (*stored).clone()
    })
}
/// Extracts per-page text segments from the PDF at this thread's current path
/// using `pdf_oxide`.
///
/// Pages `0..page_count` are processed in order and each contributes exactly
/// one entry to the returned outer vector.
///
/// # Returns
///
/// * `None` if no path has been set via `set_current_pdf_path`, or if
///   `pdf_oxide` cannot open the document (both cases are logged at debug
///   level).
/// * `Some(pages)` otherwise. A page whose span extraction fails is logged and
///   represented by an empty vector rather than aborting the whole run.
///
/// Spans that carry an `artifact_type` or whose text is empty after trimming
/// are skipped.
#[cfg(feature = "pdf")]
pub(crate) fn extract_segments_with_oxide(page_count: usize) -> Option<Vec<Vec<SegmentData>>> {
    use pdf_oxide::layout::text_block::FontWeight;

    let source_path = if let Some(path) = current_pdf_path() {
        tracing::debug!(path = %path.display(), "pdf_oxide: file path available");
        path
    } else {
        tracing::debug!("pdf_oxide: no file path set (bytes-only extraction), skipping");
        return None;
    };

    // Log the open failure, then let `?` turn it into an early `None`.
    let mut document = pdf_oxide::api::Pdf::open(&source_path)
        .map_err(|e| tracing::debug!("pdf_oxide failed to open document: {e}"))
        .ok()?;

    let mut pages: Vec<Vec<SegmentData>> = Vec::with_capacity(page_count);

    for page_number in 0..page_count {
        // Height is the absolute difference of the media box's 2nd and 4th
        // entries. The 792.0 fallback equals 11 in x 72, i.e. presumably a
        // US Letter page height in PDF points -- TODO confirm units.
        let page_height = document
            .page_media_box(page_number)
            .ok()
            .map_or(792.0, |[_, lower_y, _, upper_y]| (upper_y - lower_y).abs());

        let segments: Vec<SegmentData> = match document.extract_spans(page_number) {
            Err(e) => {
                tracing::debug!(page = page_number, "pdf_oxide extract_spans failed: {e}");
                Vec::new()
            }
            Ok(spans) => spans
                .into_iter()
                .filter(|span| span.artifact_type.is_none() && !span.text.trim().is_empty())
                .map(|span| {
                    let rect = &span.bbox;

                    // Both values subtract the box's `y + height` from the page
                    // height. NOTE(review): presumably this flips a top-left
                    // origin into bottom-left PDF space -- confirm against
                    // pdf_oxide's bbox convention.
                    let baseline_y = page_height - (rect.y + rect.height);
                    let flipped_y = page_height - rect.y - rect.height;

                    let is_bold = span.font_weight == FontWeight::Bold;
                    // Case-sensitive substring check on the font name.
                    let is_monospace = ["Mono", "Courier", "Consola"]
                        .iter()
                        .any(|marker| span.font_name.contains(*marker));

                    SegmentData {
                        x: rect.x,
                        y: flipped_y,
                        width: rect.width,
                        height: rect.height,
                        font_size: span.font_size,
                        is_bold,
                        is_italic: span.is_italic,
                        is_monospace,
                        baseline_y,
                        text: span.text,
                    }
                })
                .collect(),
        };

        pages.push(segments);
    }

    Some(pages)
}