// docx_lite/parser.rs

1use std::io::{Read, Seek};
2use quick_xml::events::Event;
3use quick_xml::Reader;
4use zip::ZipArchive;
5
6use crate::error::{DocxError, Result};
7use crate::types::{Document, Paragraph, Run, Table, TableRow, TableCell};
8
9pub struct DocxParser<R: Read + Seek> {
10    archive: ZipArchive<R>,
11}
12
13impl<R: Read + Seek> DocxParser<R> {
14    pub fn new(reader: R) -> Result<Self> {
15        let archive = ZipArchive::new(reader)?;
16        Ok(Self { archive })
17    }
18
19    pub fn parse(mut self) -> Result<Document> {
20        let mut document = Document::new();
21
22        // Extract main document content
23        let document_xml = self.read_document_xml()?;
24        self.parse_document_xml(&document_xml, &mut document)?;
25
26        Ok(document)
27    }
28
29    fn read_document_xml(&mut self) -> Result<String> {
30        let mut file = self.archive
31            .by_name("word/document.xml")
32            .map_err(|_| DocxError::FileNotFound("word/document.xml".to_string()))?;
33
34        let mut contents = String::new();
35        file.read_to_string(&mut contents)?;
36        Ok(contents)
37    }
38
39    fn parse_document_xml(&self, xml: &str, document: &mut Document) -> Result<()> {
40        let mut reader = Reader::from_str(xml);
41        reader.config_mut().trim_text(true);
42
43        let mut buf = Vec::new();
44        let mut current_paragraph: Option<Paragraph> = None;
45        let mut current_run: Option<Run> = None;
46        let mut current_table: Option<Table> = None;
47        let mut current_row: Option<TableRow> = None;
48        let mut current_cell: Option<TableCell> = None;
49        let mut in_text = false;
50        let mut in_table = false;
51
52        loop {
53            match reader.read_event_into(&mut buf) {
54                Ok(Event::Start(ref e)) => {
55                    match e.name().as_ref() {
56                        b"w:p" => {
57                            // Start of a paragraph
58                            if in_table {
59                                // Paragraph inside a table cell
60                                if current_cell.is_none() {
61                                    current_cell = Some(TableCell::default());
62                                }
63                            } else {
64                                current_paragraph = Some(Paragraph::new());
65                            }
66                        }
67                        b"w:r" => {
68                            // Start of a run
69                            current_run = Some(Run::default());
70                        }
71                        b"w:t" => {
72                            // Text element
73                            in_text = true;
74                        }
75                        b"w:tbl" => {
76                            // Start of a table
77                            in_table = true;
78                            current_table = Some(Table::new());
79                        }
80                        b"w:tr" => {
81                            // Table row
82                            current_row = Some(TableRow::default());
83                        }
84                        b"w:tc" => {
85                            // Table cell
86                            current_cell = Some(TableCell::default());
87                        }
88                        b"w:b" => {
89                            // Bold formatting
90                            if let Some(ref mut run) = current_run {
91                                run.bold = true;
92                            }
93                        }
94                        b"w:i" => {
95                            // Italic formatting
96                            if let Some(ref mut run) = current_run {
97                                run.italic = true;
98                            }
99                        }
100                        b"w:u" => {
101                            // Underline formatting
102                            if let Some(ref mut run) = current_run {
103                                run.underline = true;
104                            }
105                        }
106                        _ => {}
107                    }
108                }
109                Ok(Event::Text(e)) => {
110                    if in_text {
111                        if let Some(ref mut run) = current_run {
112                            let text = e.unescape()?.into_owned();
113                            run.text.push_str(&text);
114                        }
115                    }
116                }
117                Ok(Event::End(ref e)) => {
118                    match e.name().as_ref() {
119                        b"w:t" => {
120                            in_text = false;
121                        }
122                        b"w:r" => {
123                            // End of a run
124                            if let Some(run) = current_run.take() {
125                                if in_table {
126                                    // Add run to table cell paragraph
127                                    if let Some(ref mut cell) = current_cell {
128                                        if cell.paragraphs.is_empty() {
129                                            cell.paragraphs.push(Paragraph::new());
130                                        }
131                                        if let Some(para) = cell.paragraphs.last_mut() {
132                                            para.add_run(run);
133                                        }
134                                    }
135                                } else if let Some(ref mut para) = current_paragraph {
136                                    para.add_run(run);
137                                }
138                            }
139                        }
140                        b"w:p" => {
141                            // End of a paragraph
142                            if in_table {
143                                // Paragraph inside table cell already handled
144                            } else if let Some(para) = current_paragraph.take() {
145                                document.paragraphs.push(para);
146                            }
147                        }
148                        b"w:tc" => {
149                            // End of table cell
150                            if let Some(cell) = current_cell.take() {
151                                if let Some(ref mut row) = current_row {
152                                    row.cells.push(cell);
153                                }
154                            }
155                        }
156                        b"w:tr" => {
157                            // End of table row
158                            if let Some(row) = current_row.take() {
159                                if let Some(ref mut table) = current_table {
160                                    table.rows.push(row);
161                                }
162                            }
163                        }
164                        b"w:tbl" => {
165                            // End of table
166                            in_table = false;
167                            if let Some(table) = current_table.take() {
168                                document.tables.push(table);
169                            }
170                        }
171                        _ => {}
172                    }
173                }
174                Ok(Event::Eof) => break,
175                Err(e) => return Err(e.into()),
176                _ => {}
177            }
178            buf.clear();
179        }
180
181        Ok(())
182    }
183}