Skip to main content

bookforge_pdf/
parse.rs

//! Parser for `pdftohtml -xml` output into the page/fragment IR.
//!
//! The format, per page:
//!
//! ```xml
//! <page number="1" width="918" height="1188" ...>
//!   <fontspec id="0" size="17" family="Times" color="#000000"/>
//!   <text top="246" left="261" width="394" height="18" font="0">Line with <b>bold</b></text>
//! </page>
//! ```
//!
//! `<text>` content may nest `<b>`, `<i>` (and occasionally `<a>`);
//! styling is flattened into spans, anchors into plain text.
14
15use std::collections::HashMap;
16
17use quick_xml::{Reader, events::Event};
18
19use crate::{
20    PdfError, Result,
21    model::{Fragment, Page, Span},
22};
23
24pub fn parse_pdf2xml(xml: &str) -> Result<Vec<Page>> {
25    let mut reader = Reader::from_str(xml);
26    reader.config_mut().trim_text(false);
27
28    let mut pages = Vec::new();
29    let mut current_page: Option<Page> = None;
30    let mut current_fragment: Option<Fragment> = None;
31    let mut bold_depth = 0usize;
32    let mut italic_depth = 0usize;
33
34    loop {
35        match reader.read_event()? {
36            Event::Start(element) | Event::Empty(element)
37                if local(element.name().as_ref()) == b"page" =>
38            {
39                if let Some(page) = current_page.take() {
40                    pages.push(page);
41                }
42                let mut number = 0u32;
43                let mut width = 0i32;
44                let mut height = 0i32;
45                for attr in element.attributes() {
46                    let attr = attr.map_err(|err| PdfError::InvalidInput(err.to_string()))?;
47                    let value = attr
48                        .decode_and_unescape_value(reader.decoder())
49                        .map_err(|err| PdfError::InvalidInput(err.to_string()))?;
50                    match local(attr.key.as_ref()) {
51                        b"number" => number = value.parse().unwrap_or(0),
52                        b"width" => width = parse_coord(&value),
53                        b"height" => height = parse_coord(&value),
54                        _ => {}
55                    }
56                }
57                current_page = Some(Page {
58                    number,
59                    width,
60                    height,
61                    fragments: Vec::new(),
62                    font_sizes: HashMap::new(),
63                });
64            }
65            Event::Empty(element) if local(element.name().as_ref()) == b"fontspec" => {
66                let Some(page) = current_page.as_mut() else {
67                    continue;
68                };
69                let mut id = None;
70                let mut size = None;
71                for attr in element.attributes() {
72                    let attr = attr.map_err(|err| PdfError::InvalidInput(err.to_string()))?;
73                    let value = attr
74                        .decode_and_unescape_value(reader.decoder())
75                        .map_err(|err| PdfError::InvalidInput(err.to_string()))?;
76                    match local(attr.key.as_ref()) {
77                        b"id" => id = value.parse::<u32>().ok(),
78                        b"size" => size = Some(parse_coord(&value).unsigned_abs()),
79                        _ => {}
80                    }
81                }
82                if let (Some(id), Some(size)) = (id, size) {
83                    page.font_sizes.insert(id, size);
84                }
85            }
86            Event::Start(element) if local(element.name().as_ref()) == b"text" => {
87                let mut fragment = Fragment {
88                    top: 0,
89                    left: 0,
90                    width: 0,
91                    height: 0,
92                    font: 0,
93                    spans: Vec::new(),
94                };
95                for attr in element.attributes() {
96                    let attr = attr.map_err(|err| PdfError::InvalidInput(err.to_string()))?;
97                    let value = attr
98                        .decode_and_unescape_value(reader.decoder())
99                        .map_err(|err| PdfError::InvalidInput(err.to_string()))?;
100                    match local(attr.key.as_ref()) {
101                        b"top" => fragment.top = parse_coord(&value),
102                        b"left" => fragment.left = parse_coord(&value),
103                        b"width" => fragment.width = parse_coord(&value),
104                        b"height" => fragment.height = parse_coord(&value),
105                        b"font" => fragment.font = value.parse().unwrap_or(0),
106                        _ => {}
107                    }
108                }
109                current_fragment = Some(fragment);
110                bold_depth = 0;
111                italic_depth = 0;
112            }
113            Event::Start(element) if current_fragment.is_some() => {
114                match local(element.name().as_ref()) {
115                    b"b" => bold_depth += 1,
116                    b"i" => italic_depth += 1,
117                    _ => {}
118                }
119            }
120            Event::End(element) if current_fragment.is_some() => {
121                match local(element.name().as_ref()) {
122                    b"text" => {
123                        let fragment = current_fragment.take().expect("checked above");
124                        if let Some(page) = current_page.as_mut()
125                            && fragment.spans.iter().any(|span| !span.text.is_empty())
126                        {
127                            page.fragments.push(fragment);
128                        }
129                    }
130                    b"b" => bold_depth = bold_depth.saturating_sub(1),
131                    b"i" => italic_depth = italic_depth.saturating_sub(1),
132                    _ => {}
133                }
134            }
135            Event::Text(text) => {
136                if let Some(fragment) = current_fragment.as_mut() {
137                    let value = text
138                        .html_content()
139                        .map_err(|err| PdfError::InvalidInput(err.to_string()))?;
140                    push_span(fragment, &value, bold_depth > 0, italic_depth > 0);
141                }
142            }
143            Event::GeneralRef(reference) => {
144                if let Some(fragment) = current_fragment.as_mut() {
145                    if let Some(ch) = reference
146                        .resolve_char_ref()
147                        .map_err(|err| PdfError::InvalidInput(err.to_string()))?
148                    {
149                        push_span(fragment, &ch.to_string(), bold_depth > 0, italic_depth > 0);
150                    } else {
151                        let name = reference
152                            .decode()
153                            .map_err(|err| PdfError::InvalidInput(err.to_string()))?;
154                        if let Some(value) = quick_xml::escape::resolve_html5_entity(&name) {
155                            push_span(fragment, value, bold_depth > 0, italic_depth > 0);
156                        }
157                    }
158                }
159            }
160            Event::End(element) if local(element.name().as_ref()) == b"page" => {
161                if let Some(page) = current_page.take() {
162                    pages.push(page);
163                }
164            }
165            Event::Eof => break,
166            _ => {}
167        }
168    }
169
170    if let Some(page) = current_page.take() {
171        pages.push(page);
172    }
173
174    Ok(pages)
175}
176
177fn push_span(fragment: &mut Fragment, text: &str, bold: bool, italic: bool) {
178    if text.is_empty() {
179        return;
180    }
181    if let Some(last) = fragment.spans.last_mut()
182        && last.bold == bold
183        && last.italic == italic
184    {
185        last.text.push_str(text);
186        return;
187    }
188    fragment.spans.push(Span {
189        text: text.to_string(),
190        bold,
191        italic,
192    });
193}
194
/// Parses a coordinate attribute into an `i32`.
///
/// pdftohtml emits integer coordinates, but some builds produce fractional
/// values; accept both. Fractional values are rounded to the nearest integer
/// (halves away from zero). Surrounding whitespace is ignored.
///
/// Anything that is not a finite number — empty strings, garbage, `inf`,
/// `NaN` — yields `0`. Finite values outside the `i32` range saturate at
/// `i32::MIN` / `i32::MAX`.
fn parse_coord(value: &str) -> i32 {
    let value = value.trim();
    if let Ok(whole) = value.parse::<i32>() {
        return whole;
    }
    match value.parse::<f64>() {
        // `f64::from_str` also accepts "inf"/"nan" spellings; those are not
        // coordinates, so only finite values are converted. The `as` cast
        // saturates for finite values that do not fit in an i32.
        Ok(fractional) if fractional.is_finite() => fractional.round() as i32,
        _ => 0,
    }
}
203
/// Returns the part of `name` after its last `:`, or all of `name` when it
/// contains no colon (i.e. drops a namespace-style prefix).
fn local(name: &[u8]) -> &[u8] {
    match name.iter().rposition(|&byte| byte == b':') {
        Some(colon) => &name[colon + 1..],
        None => name,
    }
}
207
#[cfg(test)]
mod tests {
    use super::*;

    /// One page with two font specs and three text fragments covering bold,
    /// italic and an escaped ampersand.
    const SAMPLE: &str = r##"<?xml version="1.0" encoding="UTF-8"?>
<pdf2xml producer="poppler" version="24.02.0">
<page number="1" position="absolute" top="0" left="0" height="1188" width="918">
  <fontspec id="0" size="22" family="Times" color="#000000"/>
  <fontspec id="1" size="11" family="Times" color="#000000"/>
  <text top="100" left="200" width="500" height="24" font="0">A <b>Bold</b> Title</text>
  <text top="200" left="100" width="350" height="12" font="1">Left column line with <i>italics</i>.</text>
  <text top="200" left="480" width="350" height="12" font="1">Right column &amp; more.</text>
</page>
</pdf2xml>"##;

    /// Shorthand for the expected [`Span`] of a styled run.
    fn span(text: &str, bold: bool, italic: bool) -> Span {
        Span {
            text: text.into(),
            bold,
            italic,
        }
    }

    #[test]
    fn parses_pages_fonts_and_styled_fragments() {
        let pages = parse_pdf2xml(SAMPLE).expect("sample should parse");
        assert_eq!(pages.len(), 1);

        // Page-level attributes and the font table.
        let page = &pages[0];
        assert_eq!(page.number, 1);
        assert_eq!(page.width, 918);
        assert_eq!(page.font_sizes.get(&0), Some(&22));
        assert_eq!(page.fragments.len(), 3);

        // The title is split wherever the bold flag changes.
        let title = &page.fragments[0];
        assert_eq!(title.font, 0);
        assert_eq!(
            title.spans,
            [
                span("A ", false, false),
                span("Bold", true, false),
                span(" Title", false, false),
            ]
        );

        // `&amp;` is resolved and merged into the surrounding plain span.
        let right = &page.fragments[2];
        assert_eq!(right.spans[0].text, "Right column & more.");
    }
}