1use std::io::{Read, Seek};
2use quick_xml::events::Event;
3use quick_xml::Reader;
4use zip::ZipArchive;
5
6use crate::error::{DocxError, Result};
7use crate::types::{
8 Document, Paragraph, Run, Table, TableRow, TableCell,
9 ListItem, ListType, HeaderFooter, Note, NoteType
10};
11use std::collections::HashMap;
12
13pub struct DocxParser<R: Read + Seek> {
14 archive: ZipArchive<R>,
15}
16
17impl<R: Read + Seek> DocxParser<R> {
18 pub fn new(reader: R) -> Result<Self> {
19 let archive = ZipArchive::new(reader)?;
20 Ok(Self { archive })
21 }
22
23 pub fn parse(mut self) -> Result<Document> {
24 let mut document = Document::new();
25
26 let document_xml = self.read_document_xml()?;
28 self.parse_document_xml(&document_xml, &mut document)?;
29
30 if let Ok(numbering_xml) = self.read_file("word/numbering.xml") {
32 let numbering_defs = self.parse_numbering(&numbering_xml)?;
33 self.process_lists(&mut document, &numbering_defs);
34 }
35
36 self.parse_headers_footers(&mut document)?;
38
39 if let Ok(footnotes_xml) = self.read_file("word/footnotes.xml") {
41 self.parse_notes(&footnotes_xml, &mut document.footnotes, NoteType::Footnote)?;
42 }
43
44 if let Ok(endnotes_xml) = self.read_file("word/endnotes.xml") {
45 self.parse_notes(&endnotes_xml, &mut document.endnotes, NoteType::Endnote)?;
46 }
47
48 Ok(document)
49 }
50
51 fn read_document_xml(&mut self) -> Result<String> {
52 self.read_file("word/document.xml")
53 }
54
55 fn read_file(&mut self, path: &str) -> Result<String> {
56 let mut file = self.archive
57 .by_name(path)
58 .map_err(|_| DocxError::FileNotFound(path.to_string()))?;
59
60 let mut contents = String::new();
61 file.read_to_string(&mut contents)?;
62 Ok(contents)
63 }
64
65 fn parse_document_xml(&self, xml: &str, document: &mut Document) -> Result<()> {
66 let mut reader = Reader::from_str(xml);
67 reader.config_mut().trim_text(true);
68
69 let mut buf = Vec::new();
70 let mut current_paragraph: Option<Paragraph> = None;
71 let mut current_run: Option<Run> = None;
72 let mut current_table: Option<Table> = None;
73 let mut current_row: Option<TableRow> = None;
74 let mut current_cell: Option<TableCell> = None;
75 let mut in_text = false;
76 let mut in_table = false;
77
78 loop {
79 match reader.read_event_into(&mut buf) {
80 Ok(Event::Start(ref e)) => {
81 match e.name().as_ref() {
82 b"w:p" => {
83 if in_table {
85 if current_cell.is_none() {
87 current_cell = Some(TableCell::default());
88 }
89 } else {
90 current_paragraph = Some(Paragraph::new());
91 }
92 }
93 b"w:numPr" => {
94 if let Some(ref mut para) = current_paragraph {
96 para.numbering_id = Some(1);
100 para.numbering_level = Some(0);
101 }
102 }
103 b"w:r" => {
104 current_run = Some(Run::default());
106 }
107 b"w:t" => {
108 in_text = true;
110 }
111 b"w:tbl" => {
112 in_table = true;
114 current_table = Some(Table::new());
115 }
116 b"w:tr" => {
117 current_row = Some(TableRow::default());
119 }
120 b"w:tc" => {
121 current_cell = Some(TableCell::default());
123 }
124 b"w:b" => {
125 if let Some(ref mut run) = current_run {
127 run.bold = true;
128 }
129 }
130 b"w:i" => {
131 if let Some(ref mut run) = current_run {
133 run.italic = true;
134 }
135 }
136 b"w:u" => {
137 if let Some(ref mut run) = current_run {
139 run.underline = true;
140 }
141 }
142 _ => {}
143 }
144 }
145 Ok(Event::Text(e)) => {
146 if in_text {
147 if let Some(ref mut run) = current_run {
148 let text = e.unescape()?.into_owned();
149 run.text.push_str(&text);
150 }
151 }
152 }
153 Ok(Event::End(ref e)) => {
154 match e.name().as_ref() {
155 b"w:t" => {
156 in_text = false;
157 }
158 b"w:r" => {
159 if let Some(run) = current_run.take() {
161 if in_table {
162 if let Some(ref mut cell) = current_cell {
164 if cell.paragraphs.is_empty() {
165 cell.paragraphs.push(Paragraph::new());
166 }
167 if let Some(para) = cell.paragraphs.last_mut() {
168 para.add_run(run);
169 }
170 }
171 } else if let Some(ref mut para) = current_paragraph {
172 para.add_run(run);
173 }
174 }
175 }
176 b"w:p" => {
177 if in_table {
179 } else if let Some(para) = current_paragraph.take() {
181 document.paragraphs.push(para);
182 }
183 }
184 b"w:tc" => {
185 if let Some(cell) = current_cell.take() {
187 if let Some(ref mut row) = current_row {
188 row.cells.push(cell);
189 }
190 }
191 }
192 b"w:tr" => {
193 if let Some(row) = current_row.take() {
195 if let Some(ref mut table) = current_table {
196 table.rows.push(row);
197 }
198 }
199 }
200 b"w:tbl" => {
201 in_table = false;
203 if let Some(table) = current_table.take() {
204 document.tables.push(table);
205 }
206 }
207 _ => {}
208 }
209 }
210 Ok(Event::Eof) => break,
211 Err(e) => return Err(e.into()),
212 _ => {}
213 }
214 buf.clear();
215 }
216
217 Ok(())
218 }
219
220 fn parse_numbering(&self, xml: &str) -> Result<HashMap<i64, ListType>> {
221 let mut numbering_defs = HashMap::new();
222 let mut reader = Reader::from_str(xml);
223 reader.config_mut().trim_text(true);
224
225 let mut buf = Vec::new();
226 let mut current_num_id: Option<i64> = None;
227
228 loop {
229 match reader.read_event_into(&mut buf) {
230 Ok(Event::Start(ref e)) => {
231 if e.name().as_ref() == b"w:num" {
232 for attr in e.attributes() {
234 if let Ok(attr) = attr {
235 if attr.key.as_ref() == b"w:numId" {
236 if let Ok(id_str) = std::str::from_utf8(&attr.value) {
237 current_num_id = id_str.parse().ok();
238 }
239 }
240 }
241 }
242 }
243 }
244 Ok(Event::End(ref e)) => {
245 if e.name().as_ref() == b"w:num" {
246 if let Some(id) = current_num_id {
247 numbering_defs.insert(id, ListType::Bullet);
250 }
251 current_num_id = None;
252 }
253 }
254 Ok(Event::Eof) => break,
255 _ => {}
256 }
257 buf.clear();
258 }
259
260 Ok(numbering_defs)
261 }
262
263 fn process_lists(&self, document: &mut Document, numbering_defs: &HashMap<i64, ListType>) {
264 for paragraph in &document.paragraphs {
265 if let (Some(num_id), Some(level)) = (paragraph.numbering_id, paragraph.numbering_level) {
266 let list_type = numbering_defs.get(&num_id)
267 .cloned()
268 .unwrap_or(ListType::Bullet);
269
270 let list_item = ListItem {
271 level: level as u32,
272 list_type,
273 number: None, text: paragraph.to_text(),
275 };
276
277 document.lists.push(list_item);
278 }
279 }
280 }
281
282 fn parse_headers_footers(&mut self, document: &mut Document) -> Result<()> {
283 for i in 1..=3 {
285 let header_path = format!("word/header{}.xml", i);
286 if let Ok(header_xml) = self.read_file(&header_path) {
287 let mut header = HeaderFooter::default();
288 self.parse_header_footer_content(&header_xml, &mut header)?;
289 document.headers.push(header);
290 }
291
292 let footer_path = format!("word/footer{}.xml", i);
293 if let Ok(footer_xml) = self.read_file(&footer_path) {
294 let mut footer = HeaderFooter::default();
295 self.parse_header_footer_content(&footer_xml, &mut footer)?;
296 document.footers.push(footer);
297 }
298 }
299
300 Ok(())
301 }
302
303 fn parse_header_footer_content(&self, xml: &str, header_footer: &mut HeaderFooter) -> Result<()> {
304 let mut reader = Reader::from_str(xml);
305 reader.config_mut().trim_text(true);
306
307 let mut buf = Vec::new();
308 let mut current_paragraph: Option<Paragraph> = None;
309 let mut current_run: Option<Run> = None;
310 let mut in_text = false;
311
312 loop {
313 match reader.read_event_into(&mut buf) {
314 Ok(Event::Start(ref e)) => {
315 match e.name().as_ref() {
316 b"w:p" => current_paragraph = Some(Paragraph::new()),
317 b"w:r" => current_run = Some(Run::default()),
318 b"w:t" => in_text = true,
319 _ => {}
320 }
321 }
322 Ok(Event::Text(e)) => {
323 if in_text {
324 if let Some(ref mut run) = current_run {
325 let text = e.unescape()?.into_owned();
326 run.text.push_str(&text);
327 }
328 }
329 }
330 Ok(Event::End(ref e)) => {
331 match e.name().as_ref() {
332 b"w:t" => in_text = false,
333 b"w:r" => {
334 if let Some(run) = current_run.take() {
335 if let Some(ref mut para) = current_paragraph {
336 para.add_run(run);
337 }
338 }
339 }
340 b"w:p" => {
341 if let Some(para) = current_paragraph.take() {
342 header_footer.paragraphs.push(para);
343 }
344 }
345 _ => {}
346 }
347 }
348 Ok(Event::Eof) => break,
349 _ => {}
350 }
351 buf.clear();
352 }
353
354 Ok(())
355 }
356
357 fn parse_notes(&self, xml: &str, notes: &mut Vec<Note>, note_type: NoteType) -> Result<()> {
358 let mut reader = Reader::from_str(xml);
359 reader.config_mut().trim_text(true);
360
361 let mut buf = Vec::new();
362 let mut current_note: Option<Note> = None;
363 let mut current_paragraph: Option<Paragraph> = None;
364 let mut current_run: Option<Run> = None;
365 let mut in_text = false;
366
367 loop {
368 match reader.read_event_into(&mut buf) {
369 Ok(Event::Start(ref e)) => {
370 match e.name().as_ref() {
371 b"w:footnote" | b"w:endnote" => {
372 let mut id = String::new();
373 for attr in e.attributes() {
374 if let Ok(attr) = attr {
375 if attr.key.as_ref() == b"w:id" {
376 id = String::from_utf8_lossy(&attr.value).to_string();
377 }
378 }
379 }
380 current_note = Some(Note {
381 id,
382 note_type: note_type.clone(),
383 paragraphs: Vec::new(),
384 });
385 }
386 b"w:p" => current_paragraph = Some(Paragraph::new()),
387 b"w:r" => current_run = Some(Run::default()),
388 b"w:t" => in_text = true,
389 _ => {}
390 }
391 }
392 Ok(Event::Text(e)) => {
393 if in_text {
394 if let Some(ref mut run) = current_run {
395 let text = e.unescape()?.into_owned();
396 run.text.push_str(&text);
397 }
398 }
399 }
400 Ok(Event::End(ref e)) => {
401 match e.name().as_ref() {
402 b"w:t" => in_text = false,
403 b"w:r" => {
404 if let Some(run) = current_run.take() {
405 if let Some(ref mut para) = current_paragraph {
406 para.add_run(run);
407 }
408 }
409 }
410 b"w:p" => {
411 if let Some(para) = current_paragraph.take() {
412 if let Some(ref mut note) = current_note {
413 note.paragraphs.push(para);
414 }
415 }
416 }
417 b"w:footnote" | b"w:endnote" => {
418 if let Some(note) = current_note.take() {
419 if note.id != "-1" && note.id != "0" {
421 notes.push(note);
422 }
423 }
424 }
425 _ => {}
426 }
427 }
428 Ok(Event::Eof) => break,
429 _ => {}
430 }
431 buf.clear();
432 }
433
434 Ok(())
435 }
436}