1use std::collections::HashMap;
16
17use quick_xml::{Reader, events::Event};
18
19use crate::{
20 PdfError, Result,
21 model::{Fragment, Page, Span},
22};
23
24pub fn parse_pdf2xml(xml: &str) -> Result<Vec<Page>> {
25 let mut reader = Reader::from_str(xml);
26 reader.config_mut().trim_text(false);
27
28 let mut pages = Vec::new();
29 let mut current_page: Option<Page> = None;
30 let mut current_fragment: Option<Fragment> = None;
31 let mut bold_depth = 0usize;
32 let mut italic_depth = 0usize;
33
34 loop {
35 match reader.read_event()? {
36 Event::Start(element) | Event::Empty(element)
37 if local(element.name().as_ref()) == b"page" =>
38 {
39 if let Some(page) = current_page.take() {
40 pages.push(page);
41 }
42 let mut number = 0u32;
43 let mut width = 0i32;
44 let mut height = 0i32;
45 for attr in element.attributes() {
46 let attr = attr.map_err(|err| PdfError::InvalidInput(err.to_string()))?;
47 let value = attr
48 .decode_and_unescape_value(reader.decoder())
49 .map_err(|err| PdfError::InvalidInput(err.to_string()))?;
50 match local(attr.key.as_ref()) {
51 b"number" => number = value.parse().unwrap_or(0),
52 b"width" => width = parse_coord(&value),
53 b"height" => height = parse_coord(&value),
54 _ => {}
55 }
56 }
57 current_page = Some(Page {
58 number,
59 width,
60 height,
61 fragments: Vec::new(),
62 font_sizes: HashMap::new(),
63 });
64 }
65 Event::Empty(element) if local(element.name().as_ref()) == b"fontspec" => {
66 let Some(page) = current_page.as_mut() else {
67 continue;
68 };
69 let mut id = None;
70 let mut size = None;
71 for attr in element.attributes() {
72 let attr = attr.map_err(|err| PdfError::InvalidInput(err.to_string()))?;
73 let value = attr
74 .decode_and_unescape_value(reader.decoder())
75 .map_err(|err| PdfError::InvalidInput(err.to_string()))?;
76 match local(attr.key.as_ref()) {
77 b"id" => id = value.parse::<u32>().ok(),
78 b"size" => size = Some(parse_coord(&value).unsigned_abs()),
79 _ => {}
80 }
81 }
82 if let (Some(id), Some(size)) = (id, size) {
83 page.font_sizes.insert(id, size);
84 }
85 }
86 Event::Start(element) if local(element.name().as_ref()) == b"text" => {
87 let mut fragment = Fragment {
88 top: 0,
89 left: 0,
90 width: 0,
91 height: 0,
92 font: 0,
93 spans: Vec::new(),
94 };
95 for attr in element.attributes() {
96 let attr = attr.map_err(|err| PdfError::InvalidInput(err.to_string()))?;
97 let value = attr
98 .decode_and_unescape_value(reader.decoder())
99 .map_err(|err| PdfError::InvalidInput(err.to_string()))?;
100 match local(attr.key.as_ref()) {
101 b"top" => fragment.top = parse_coord(&value),
102 b"left" => fragment.left = parse_coord(&value),
103 b"width" => fragment.width = parse_coord(&value),
104 b"height" => fragment.height = parse_coord(&value),
105 b"font" => fragment.font = value.parse().unwrap_or(0),
106 _ => {}
107 }
108 }
109 current_fragment = Some(fragment);
110 bold_depth = 0;
111 italic_depth = 0;
112 }
113 Event::Start(element) if current_fragment.is_some() => {
114 match local(element.name().as_ref()) {
115 b"b" => bold_depth += 1,
116 b"i" => italic_depth += 1,
117 _ => {}
118 }
119 }
120 Event::End(element) if current_fragment.is_some() => {
121 match local(element.name().as_ref()) {
122 b"text" => {
123 let fragment = current_fragment.take().expect("checked above");
124 if let Some(page) = current_page.as_mut()
125 && fragment.spans.iter().any(|span| !span.text.is_empty())
126 {
127 page.fragments.push(fragment);
128 }
129 }
130 b"b" => bold_depth = bold_depth.saturating_sub(1),
131 b"i" => italic_depth = italic_depth.saturating_sub(1),
132 _ => {}
133 }
134 }
135 Event::Text(text) => {
136 if let Some(fragment) = current_fragment.as_mut() {
137 let value = text
138 .html_content()
139 .map_err(|err| PdfError::InvalidInput(err.to_string()))?;
140 push_span(fragment, &value, bold_depth > 0, italic_depth > 0);
141 }
142 }
143 Event::GeneralRef(reference) => {
144 if let Some(fragment) = current_fragment.as_mut() {
145 if let Some(ch) = reference
146 .resolve_char_ref()
147 .map_err(|err| PdfError::InvalidInput(err.to_string()))?
148 {
149 push_span(fragment, &ch.to_string(), bold_depth > 0, italic_depth > 0);
150 } else {
151 let name = reference
152 .decode()
153 .map_err(|err| PdfError::InvalidInput(err.to_string()))?;
154 if let Some(value) = quick_xml::escape::resolve_html5_entity(&name) {
155 push_span(fragment, value, bold_depth > 0, italic_depth > 0);
156 }
157 }
158 }
159 }
160 Event::End(element) if local(element.name().as_ref()) == b"page" => {
161 if let Some(page) = current_page.take() {
162 pages.push(page);
163 }
164 }
165 Event::Eof => break,
166 _ => {}
167 }
168 }
169
170 if let Some(page) = current_page.take() {
171 pages.push(page);
172 }
173
174 Ok(pages)
175}
176
177fn push_span(fragment: &mut Fragment, text: &str, bold: bool, italic: bool) {
178 if text.is_empty() {
179 return;
180 }
181 if let Some(last) = fragment.spans.last_mut()
182 && last.bold == bold
183 && last.italic == italic
184 {
185 last.text.push_str(text);
186 return;
187 }
188 fragment.spans.push(Span {
189 text: text.to_string(),
190 bold,
191 italic,
192 });
193}
194
/// Parses a numeric attribute value into an `i32`.
///
/// Surrounding whitespace is ignored. Integers are taken as-is; decimal
/// values are rounded to the nearest integer (halves away from zero).
/// Anything that is not a finite number — including the `inf` / `nan`
/// spellings that `f64::from_str` accepts — yields `0`.
fn parse_coord(value: &str) -> i32 {
    let value = value.trim();
    if let Ok(whole) = value.parse::<i32>() {
        return whole;
    }
    match value.parse::<f64>() {
        // The `as` cast saturates for finite values outside the `i32` range.
        Ok(number) if number.is_finite() => number.round() as i32,
        _ => 0,
    }
}
203
/// Returns the part of `name` after its last `:`, or all of `name` when it
/// contains no colon (i.e. strips a namespace prefix from a qualified name).
fn local(name: &[u8]) -> &[u8] {
    match name.iter().rposition(|&byte| byte == b':') {
        Some(colon) => &name[colon + 1..],
        None => name,
    }
}
207
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal pdf2xml document: one page, two font specs and three text
    /// fragments covering bold text, italic text and an escaped ampersand.
    const SAMPLE: &str = r##"<?xml version="1.0" encoding="UTF-8"?>
<pdf2xml producer="poppler" version="24.02.0">
<page number="1" position="absolute" top="0" left="0" height="1188" width="918">
    <fontspec id="0" size="22" family="Times" color="#000000"/>
    <fontspec id="1" size="11" family="Times" color="#000000"/>
    <text top="100" left="200" width="500" height="24" font="0">A <b>Bold</b> Title</text>
    <text top="200" left="100" width="350" height="12" font="1">Left column line with <i>italics</i>.</text>
    <text top="200" left="480" width="350" height="12" font="1">Right column &amp; more.</text>
</page>
</pdf2xml>"##;

    #[test]
    fn parses_pages_fonts_and_styled_fragments() {
        let pages = parse_pdf2xml(SAMPLE).expect("sample should parse");
        assert_eq!(pages.len(), 1);
        let page = &pages[0];
        assert_eq!(page.number, 1);
        assert_eq!(page.width, 918);
        assert_eq!(page.height, 1188);
        assert_eq!(page.font_sizes.get(&0), Some(&22));
        assert_eq!(page.font_sizes.get(&1), Some(&11));
        assert_eq!(page.fragments.len(), 3);

        // Bold markup splits the title into three spans.
        let title = &page.fragments[0];
        assert_eq!(title.font, 0);
        assert_eq!(
            title.spans,
            vec![
                Span {
                    text: "A ".into(),
                    bold: false,
                    italic: false
                },
                Span {
                    text: "Bold".into(),
                    bold: true,
                    italic: false
                },
                Span {
                    text: " Title".into(),
                    bold: false,
                    italic: false
                },
            ]
        );

        // Italic markup is tracked independently of bold.
        let left = &page.fragments[1];
        assert_eq!(left.font, 1);
        assert_eq!(
            left.spans,
            vec![
                Span {
                    text: "Left column line with ".into(),
                    bold: false,
                    italic: false
                },
                Span {
                    text: "italics".into(),
                    bold: false,
                    italic: true
                },
                Span {
                    text: ".".into(),
                    bold: false,
                    italic: false
                },
            ]
        );

        // The entity reference is resolved and merged into the surrounding
        // span because the styling does not change.
        let right = &page.fragments[2];
        assert_eq!(right.spans.len(), 1);
        assert_eq!(right.spans[0].text, "Right column & more.");
    }
}