docx_lite/
parser.rs

1use std::io::{Read, Seek};
2use quick_xml::events::Event;
3use quick_xml::Reader;
4use zip::ZipArchive;
5
6use crate::error::{DocxError, Result};
7use crate::types::{
8    Document, Paragraph, Run, Table, TableRow, TableCell,
9    ListItem, ListType, HeaderFooter, Note, NoteType
10};
11use std::collections::HashMap;
12
13pub struct DocxParser<R: Read + Seek> {
14    archive: ZipArchive<R>,
15}
16
17impl<R: Read + Seek> DocxParser<R> {
18    pub fn new(reader: R) -> Result<Self> {
19        let archive = ZipArchive::new(reader)?;
20        Ok(Self { archive })
21    }
22
23    pub fn parse(mut self) -> Result<Document> {
24        let mut document = Document::new();
25
26        // Extract main document content
27        let document_xml = self.read_document_xml()?;
28        self.parse_document_xml(&document_xml, &mut document)?;
29
30        // Try to parse numbering definitions for lists
31        if let Ok(numbering_xml) = self.read_file("word/numbering.xml") {
32            let numbering_defs = self.parse_numbering(&numbering_xml)?;
33            self.process_lists(&mut document, &numbering_defs);
34        }
35
36        // Try to parse headers and footers
37        self.parse_headers_footers(&mut document)?;
38
39        // Try to parse footnotes and endnotes
40        if let Ok(footnotes_xml) = self.read_file("word/footnotes.xml") {
41            self.parse_notes(&footnotes_xml, &mut document.footnotes, NoteType::Footnote)?;
42        }
43
44        if let Ok(endnotes_xml) = self.read_file("word/endnotes.xml") {
45            self.parse_notes(&endnotes_xml, &mut document.endnotes, NoteType::Endnote)?;
46        }
47
48        Ok(document)
49    }
50
51    fn read_document_xml(&mut self) -> Result<String> {
52        self.read_file("word/document.xml")
53    }
54
55    fn read_file(&mut self, path: &str) -> Result<String> {
56        let mut file = self.archive
57            .by_name(path)
58            .map_err(|_| DocxError::FileNotFound(path.to_string()))?;
59
60        let mut contents = String::new();
61        file.read_to_string(&mut contents)?;
62        Ok(contents)
63    }
64
65    fn parse_document_xml(&self, xml: &str, document: &mut Document) -> Result<()> {
66        let mut reader = Reader::from_str(xml);
67        reader.config_mut().trim_text(true);
68
69        let mut buf = Vec::new();
70        let mut current_paragraph: Option<Paragraph> = None;
71        let mut current_run: Option<Run> = None;
72        let mut current_table: Option<Table> = None;
73        let mut current_row: Option<TableRow> = None;
74        let mut current_cell: Option<TableCell> = None;
75        let mut in_text = false;
76        let mut in_table = false;
77
78        loop {
79            match reader.read_event_into(&mut buf) {
80                Ok(Event::Start(ref e)) => {
81                    match e.name().as_ref() {
82                        b"w:p" => {
83                            // Start of a paragraph
84                            if in_table {
85                                // Paragraph inside a table cell
86                                if current_cell.is_none() {
87                                    current_cell = Some(TableCell::default());
88                                }
89                            } else {
90                                current_paragraph = Some(Paragraph::new());
91                            }
92                        }
93                        b"w:numPr" => {
94                            // Numbering properties (list item)
95                            if let Some(ref mut para) = current_paragraph {
96                                // Parse numId and ilvl from attributes or child elements
97                                // For simplicity, we'll set defaults here
98                                // In a full implementation, we'd parse the XML attributes
99                                para.numbering_id = Some(1);
100                                para.numbering_level = Some(0);
101                            }
102                        }
103                        b"w:r" => {
104                            // Start of a run
105                            current_run = Some(Run::default());
106                        }
107                        b"w:t" => {
108                            // Text element
109                            in_text = true;
110                        }
111                        b"w:tbl" => {
112                            // Start of a table
113                            in_table = true;
114                            current_table = Some(Table::new());
115                        }
116                        b"w:tr" => {
117                            // Table row
118                            current_row = Some(TableRow::default());
119                        }
120                        b"w:tc" => {
121                            // Table cell
122                            current_cell = Some(TableCell::default());
123                        }
124                        b"w:b" => {
125                            // Bold formatting
126                            if let Some(ref mut run) = current_run {
127                                run.bold = true;
128                            }
129                        }
130                        b"w:i" => {
131                            // Italic formatting
132                            if let Some(ref mut run) = current_run {
133                                run.italic = true;
134                            }
135                        }
136                        b"w:u" => {
137                            // Underline formatting
138                            if let Some(ref mut run) = current_run {
139                                run.underline = true;
140                            }
141                        }
142                        _ => {}
143                    }
144                }
145                Ok(Event::Text(e)) => {
146                    if in_text {
147                        if let Some(ref mut run) = current_run {
148                            let text = e.unescape()?.into_owned();
149                            run.text.push_str(&text);
150                        }
151                    }
152                }
153                Ok(Event::End(ref e)) => {
154                    match e.name().as_ref() {
155                        b"w:t" => {
156                            in_text = false;
157                        }
158                        b"w:r" => {
159                            // End of a run
160                            if let Some(run) = current_run.take() {
161                                if in_table {
162                                    // Add run to table cell paragraph
163                                    if let Some(ref mut cell) = current_cell {
164                                        if cell.paragraphs.is_empty() {
165                                            cell.paragraphs.push(Paragraph::new());
166                                        }
167                                        if let Some(para) = cell.paragraphs.last_mut() {
168                                            para.add_run(run);
169                                        }
170                                    }
171                                } else if let Some(ref mut para) = current_paragraph {
172                                    para.add_run(run);
173                                }
174                            }
175                        }
176                        b"w:p" => {
177                            // End of a paragraph
178                            if in_table {
179                                // Paragraph inside table cell already handled
180                            } else if let Some(para) = current_paragraph.take() {
181                                document.paragraphs.push(para);
182                            }
183                        }
184                        b"w:tc" => {
185                            // End of table cell
186                            if let Some(cell) = current_cell.take() {
187                                if let Some(ref mut row) = current_row {
188                                    row.cells.push(cell);
189                                }
190                            }
191                        }
192                        b"w:tr" => {
193                            // End of table row
194                            if let Some(row) = current_row.take() {
195                                if let Some(ref mut table) = current_table {
196                                    table.rows.push(row);
197                                }
198                            }
199                        }
200                        b"w:tbl" => {
201                            // End of table
202                            in_table = false;
203                            if let Some(table) = current_table.take() {
204                                document.tables.push(table);
205                            }
206                        }
207                        _ => {}
208                    }
209                }
210                Ok(Event::Eof) => break,
211                Err(e) => return Err(e.into()),
212                _ => {}
213            }
214            buf.clear();
215        }
216
217        Ok(())
218    }
219
220    fn parse_numbering(&self, xml: &str) -> Result<HashMap<i64, ListType>> {
221        let mut numbering_defs = HashMap::new();
222        let mut reader = Reader::from_str(xml);
223        reader.config_mut().trim_text(true);
224
225        let mut buf = Vec::new();
226        let mut current_num_id: Option<i64> = None;
227
228        loop {
229            match reader.read_event_into(&mut buf) {
230                Ok(Event::Start(ref e)) => {
231                    if e.name().as_ref() == b"w:num" {
232                        // Try to get numId attribute
233                        for attr in e.attributes() {
234                            if let Ok(attr) = attr {
235                                if attr.key.as_ref() == b"w:numId" {
236                                    if let Ok(id_str) = std::str::from_utf8(&attr.value) {
237                                        current_num_id = id_str.parse().ok();
238                                    }
239                                }
240                            }
241                        }
242                    }
243                }
244                Ok(Event::End(ref e)) => {
245                    if e.name().as_ref() == b"w:num" {
246                        if let Some(id) = current_num_id {
247                            // For simplicity, default to bullet
248                            // In a full implementation, we'd check the abstractNum
249                            numbering_defs.insert(id, ListType::Bullet);
250                        }
251                        current_num_id = None;
252                    }
253                }
254                Ok(Event::Eof) => break,
255                _ => {}
256            }
257            buf.clear();
258        }
259
260        Ok(numbering_defs)
261    }
262
263    fn process_lists(&self, document: &mut Document, numbering_defs: &HashMap<i64, ListType>) {
264        for paragraph in &document.paragraphs {
265            if let (Some(num_id), Some(level)) = (paragraph.numbering_id, paragraph.numbering_level) {
266                let list_type = numbering_defs.get(&num_id)
267                    .cloned()
268                    .unwrap_or(ListType::Bullet);
269
270                let list_item = ListItem {
271                    level: level as u32,
272                    list_type,
273                    number: None, // Would need counter tracking for numbered lists
274                    text: paragraph.to_text(),
275                };
276
277                document.lists.push(list_item);
278            }
279        }
280    }
281
282    fn parse_headers_footers(&mut self, document: &mut Document) -> Result<()> {
283        // Try to parse headers
284        for i in 1..=3 {
285            let header_path = format!("word/header{}.xml", i);
286            if let Ok(header_xml) = self.read_file(&header_path) {
287                let mut header = HeaderFooter::default();
288                self.parse_header_footer_content(&header_xml, &mut header)?;
289                document.headers.push(header);
290            }
291
292            let footer_path = format!("word/footer{}.xml", i);
293            if let Ok(footer_xml) = self.read_file(&footer_path) {
294                let mut footer = HeaderFooter::default();
295                self.parse_header_footer_content(&footer_xml, &mut footer)?;
296                document.footers.push(footer);
297            }
298        }
299
300        Ok(())
301    }
302
303    fn parse_header_footer_content(&self, xml: &str, header_footer: &mut HeaderFooter) -> Result<()> {
304        let mut reader = Reader::from_str(xml);
305        reader.config_mut().trim_text(true);
306
307        let mut buf = Vec::new();
308        let mut current_paragraph: Option<Paragraph> = None;
309        let mut current_run: Option<Run> = None;
310        let mut in_text = false;
311
312        loop {
313            match reader.read_event_into(&mut buf) {
314                Ok(Event::Start(ref e)) => {
315                    match e.name().as_ref() {
316                        b"w:p" => current_paragraph = Some(Paragraph::new()),
317                        b"w:r" => current_run = Some(Run::default()),
318                        b"w:t" => in_text = true,
319                        _ => {}
320                    }
321                }
322                Ok(Event::Text(e)) => {
323                    if in_text {
324                        if let Some(ref mut run) = current_run {
325                            let text = e.unescape()?.into_owned();
326                            run.text.push_str(&text);
327                        }
328                    }
329                }
330                Ok(Event::End(ref e)) => {
331                    match e.name().as_ref() {
332                        b"w:t" => in_text = false,
333                        b"w:r" => {
334                            if let Some(run) = current_run.take() {
335                                if let Some(ref mut para) = current_paragraph {
336                                    para.add_run(run);
337                                }
338                            }
339                        }
340                        b"w:p" => {
341                            if let Some(para) = current_paragraph.take() {
342                                header_footer.paragraphs.push(para);
343                            }
344                        }
345                        _ => {}
346                    }
347                }
348                Ok(Event::Eof) => break,
349                _ => {}
350            }
351            buf.clear();
352        }
353
354        Ok(())
355    }
356
357    fn parse_notes(&self, xml: &str, notes: &mut Vec<Note>, note_type: NoteType) -> Result<()> {
358        let mut reader = Reader::from_str(xml);
359        reader.config_mut().trim_text(true);
360
361        let mut buf = Vec::new();
362        let mut current_note: Option<Note> = None;
363        let mut current_paragraph: Option<Paragraph> = None;
364        let mut current_run: Option<Run> = None;
365        let mut in_text = false;
366
367        loop {
368            match reader.read_event_into(&mut buf) {
369                Ok(Event::Start(ref e)) => {
370                    match e.name().as_ref() {
371                        b"w:footnote" | b"w:endnote" => {
372                            let mut id = String::new();
373                            for attr in e.attributes() {
374                                if let Ok(attr) = attr {
375                                    if attr.key.as_ref() == b"w:id" {
376                                        id = String::from_utf8_lossy(&attr.value).to_string();
377                                    }
378                                }
379                            }
380                            current_note = Some(Note {
381                                id,
382                                note_type: note_type.clone(),
383                                paragraphs: Vec::new(),
384                            });
385                        }
386                        b"w:p" => current_paragraph = Some(Paragraph::new()),
387                        b"w:r" => current_run = Some(Run::default()),
388                        b"w:t" => in_text = true,
389                        _ => {}
390                    }
391                }
392                Ok(Event::Text(e)) => {
393                    if in_text {
394                        if let Some(ref mut run) = current_run {
395                            let text = e.unescape()?.into_owned();
396                            run.text.push_str(&text);
397                        }
398                    }
399                }
400                Ok(Event::End(ref e)) => {
401                    match e.name().as_ref() {
402                        b"w:t" => in_text = false,
403                        b"w:r" => {
404                            if let Some(run) = current_run.take() {
405                                if let Some(ref mut para) = current_paragraph {
406                                    para.add_run(run);
407                                }
408                            }
409                        }
410                        b"w:p" => {
411                            if let Some(para) = current_paragraph.take() {
412                                if let Some(ref mut note) = current_note {
413                                    note.paragraphs.push(para);
414                                }
415                            }
416                        }
417                        b"w:footnote" | b"w:endnote" => {
418                            if let Some(note) = current_note.take() {
419                                // Skip separator and continuation notes (ids -1 and 0)
420                                if note.id != "-1" && note.id != "0" {
421                                    notes.push(note);
422                                }
423                            }
424                        }
425                        _ => {}
426                    }
427                }
428                Ok(Event::Eof) => break,
429                _ => {}
430            }
431            buf.clear();
432        }
433
434        Ok(())
435    }
436}