use std::collections::HashMap;
use quick_xml::{Reader, events::Event};
use crate::{
PdfError, Result,
model::{Fragment, Page, Span},
};
pub fn parse_pdf2xml(xml: &str) -> Result<Vec<Page>> {
let mut reader = Reader::from_str(xml);
reader.config_mut().trim_text(false);
let mut pages = Vec::new();
let mut current_page: Option<Page> = None;
let mut current_fragment: Option<Fragment> = None;
let mut bold_depth = 0usize;
let mut italic_depth = 0usize;
loop {
match reader.read_event()? {
Event::Start(element) | Event::Empty(element)
if local(element.name().as_ref()) == b"page" =>
{
if let Some(page) = current_page.take() {
pages.push(page);
}
let mut number = 0u32;
let mut width = 0i32;
let mut height = 0i32;
for attr in element.attributes() {
let attr = attr.map_err(|err| PdfError::InvalidInput(err.to_string()))?;
let value = attr
.decode_and_unescape_value(reader.decoder())
.map_err(|err| PdfError::InvalidInput(err.to_string()))?;
match local(attr.key.as_ref()) {
b"number" => number = value.parse().unwrap_or(0),
b"width" => width = parse_coord(&value),
b"height" => height = parse_coord(&value),
_ => {}
}
}
current_page = Some(Page {
number,
width,
height,
fragments: Vec::new(),
font_sizes: HashMap::new(),
});
}
Event::Empty(element) if local(element.name().as_ref()) == b"fontspec" => {
let Some(page) = current_page.as_mut() else {
continue;
};
let mut id = None;
let mut size = None;
for attr in element.attributes() {
let attr = attr.map_err(|err| PdfError::InvalidInput(err.to_string()))?;
let value = attr
.decode_and_unescape_value(reader.decoder())
.map_err(|err| PdfError::InvalidInput(err.to_string()))?;
match local(attr.key.as_ref()) {
b"id" => id = value.parse::<u32>().ok(),
b"size" => size = Some(parse_coord(&value).unsigned_abs()),
_ => {}
}
}
if let (Some(id), Some(size)) = (id, size) {
page.font_sizes.insert(id, size);
}
}
Event::Start(element) if local(element.name().as_ref()) == b"text" => {
let mut fragment = Fragment {
top: 0,
left: 0,
width: 0,
height: 0,
font: 0,
spans: Vec::new(),
};
for attr in element.attributes() {
let attr = attr.map_err(|err| PdfError::InvalidInput(err.to_string()))?;
let value = attr
.decode_and_unescape_value(reader.decoder())
.map_err(|err| PdfError::InvalidInput(err.to_string()))?;
match local(attr.key.as_ref()) {
b"top" => fragment.top = parse_coord(&value),
b"left" => fragment.left = parse_coord(&value),
b"width" => fragment.width = parse_coord(&value),
b"height" => fragment.height = parse_coord(&value),
b"font" => fragment.font = value.parse().unwrap_or(0),
_ => {}
}
}
current_fragment = Some(fragment);
bold_depth = 0;
italic_depth = 0;
}
Event::Start(element) if current_fragment.is_some() => {
match local(element.name().as_ref()) {
b"b" => bold_depth += 1,
b"i" => italic_depth += 1,
_ => {}
}
}
Event::End(element) if current_fragment.is_some() => {
match local(element.name().as_ref()) {
b"text" => {
let fragment = current_fragment.take().expect("checked above");
if let Some(page) = current_page.as_mut()
&& fragment.spans.iter().any(|span| !span.text.is_empty())
{
page.fragments.push(fragment);
}
}
b"b" => bold_depth = bold_depth.saturating_sub(1),
b"i" => italic_depth = italic_depth.saturating_sub(1),
_ => {}
}
}
Event::Text(text) => {
if let Some(fragment) = current_fragment.as_mut() {
let value = text
.html_content()
.map_err(|err| PdfError::InvalidInput(err.to_string()))?;
push_span(fragment, &value, bold_depth > 0, italic_depth > 0);
}
}
Event::GeneralRef(reference) => {
if let Some(fragment) = current_fragment.as_mut() {
if let Some(ch) = reference
.resolve_char_ref()
.map_err(|err| PdfError::InvalidInput(err.to_string()))?
{
push_span(fragment, &ch.to_string(), bold_depth > 0, italic_depth > 0);
} else {
let name = reference
.decode()
.map_err(|err| PdfError::InvalidInput(err.to_string()))?;
if let Some(value) = quick_xml::escape::resolve_html5_entity(&name) {
push_span(fragment, value, bold_depth > 0, italic_depth > 0);
}
}
}
}
Event::End(element) if local(element.name().as_ref()) == b"page" => {
if let Some(page) = current_page.take() {
pages.push(page);
}
}
Event::Eof => break,
_ => {}
}
}
if let Some(page) = current_page.take() {
pages.push(page);
}
Ok(pages)
}
/// Appends `text` to `fragment` with the given style.
///
/// Empty text is ignored. When the fragment's last span has the same `bold`
/// and `italic` flags the text is appended to that span; otherwise a new span
/// is started.
fn push_span(fragment: &mut Fragment, text: &str, bold: bool, italic: bool) {
    if text.is_empty() {
        return;
    }
    match fragment.spans.last_mut() {
        // Same styling as the trailing span: extend it in place.
        Some(tail) if tail.bold == bold && tail.italic == italic => tail.text.push_str(text),
        // First span, or the styling changed: open a fresh span.
        _ => fragment.spans.push(Span {
            text: text.to_string(),
            bold,
            italic,
        }),
    }
}
/// Parses a coordinate or size attribute into whole units.
///
/// Surrounding whitespace is ignored. Integer input is returned as-is;
/// fractional input is rounded to the nearest integer (halves away from zero)
/// and finite values outside the `i32` range saturate at its bounds.
/// Anything else — empty or non-numeric text, `NaN`, or an infinity — yields `0`.
fn parse_coord(value: &str) -> i32 {
    let value = value.trim();
    if let Ok(whole) = value.parse::<i32>() {
        return whole;
    }
    match value.parse::<f64>() {
        // `f64::from_str` accepts "inf"/"nan"; those are not coordinates.
        // The float-to-int `as` cast saturates instead of wrapping.
        Ok(fractional) if fractional.is_finite() => fractional.round() as i32,
        _ => 0,
    }
}
/// Returns the part of a possibly prefixed XML name that follows the last
/// `:`, or the whole name when it contains no colon.
fn local(name: &[u8]) -> &[u8] {
    match name.iter().rposition(|&byte| byte == b':') {
        Some(colon) => &name[colon + 1..],
        None => name,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // One page with two font specs and three fragments covering bold, italic
    // and an entity reference. The ampersand in the last fragment must be
    // written as an entity: a bare `&` is not well-formed XML character data.
    const SAMPLE: &str = r##"<?xml version="1.0" encoding="UTF-8"?>
<pdf2xml producer="poppler" version="24.02.0">
<page number="1" position="absolute" top="0" left="0" height="1188" width="918">
<fontspec id="0" size="22" family="Times" color="#000000"/>
<fontspec id="1" size="11" family="Times" color="#000000"/>
<text top="100" left="200" width="500" height="24" font="0">A <b>Bold</b> Title</text>
<text top="200" left="100" width="350" height="12" font="1">Left column line with <i>italics</i>.</text>
<text top="200" left="480" width="350" height="12" font="1">Right column &amp; more.</text>
</page>
</pdf2xml>"##;

    #[test]
    fn parses_pages_fonts_and_styled_fragments() {
        let pages = parse_pdf2xml(SAMPLE).expect("sample should parse");
        assert_eq!(pages.len(), 1);

        let page = &pages[0];
        assert_eq!(page.number, 1);
        assert_eq!(page.width, 918);
        assert_eq!(page.height, 1188);
        assert_eq!(page.font_sizes.get(&0), Some(&22));
        assert_eq!(page.font_sizes.get(&1), Some(&11));
        assert_eq!(page.fragments.len(), 3);

        // Bold markup splits the title into three spans.
        let title = &page.fragments[0];
        assert_eq!(title.font, 0);
        assert_eq!(
            title.spans,
            vec![
                Span {
                    text: "A ".into(),
                    bold: false,
                    italic: false
                },
                Span {
                    text: "Bold".into(),
                    bold: true,
                    italic: false
                },
                Span {
                    text: " Title".into(),
                    bold: false,
                    italic: false
                },
            ]
        );

        // Italic markup is tracked independently of bold.
        let left = &page.fragments[1];
        assert_eq!(
            left.spans[1],
            Span {
                text: "italics".into(),
                bold: false,
                italic: true
            }
        );

        // The entity resolves to `&` and merges into the surrounding span.
        let right = &page.fragments[2];
        assert_eq!(right.spans.len(), 1);
        assert_eq!(right.spans[0].text, "Right column & more.");
    }
}