docx_lite/parser.rs
1use std::io::{Read, Seek};
2use quick_xml::events::Event;
3use quick_xml::Reader;
4use zip::ZipArchive;
5
6use crate::error::{DocxError, Result};
7use crate::types::{Document, Paragraph, Run, Table, TableRow, TableCell};
8
9pub struct DocxParser<R: Read + Seek> {
10 archive: ZipArchive<R>,
11}
12
13impl<R: Read + Seek> DocxParser<R> {
14 pub fn new(reader: R) -> Result<Self> {
15 let archive = ZipArchive::new(reader)?;
16 Ok(Self { archive })
17 }
18
19 pub fn parse(mut self) -> Result<Document> {
20 let mut document = Document::new();
21
22 // Extract main document content
23 let document_xml = self.read_document_xml()?;
24 self.parse_document_xml(&document_xml, &mut document)?;
25
26 Ok(document)
27 }
28
29 fn read_document_xml(&mut self) -> Result<String> {
30 let mut file = self.archive
31 .by_name("word/document.xml")
32 .map_err(|_| DocxError::FileNotFound("word/document.xml".to_string()))?;
33
34 let mut contents = String::new();
35 file.read_to_string(&mut contents)?;
36 Ok(contents)
37 }
38
39 fn parse_document_xml(&self, xml: &str, document: &mut Document) -> Result<()> {
40 let mut reader = Reader::from_str(xml);
41 reader.config_mut().trim_text(true);
42
43 let mut buf = Vec::new();
44 let mut current_paragraph: Option<Paragraph> = None;
45 let mut current_run: Option<Run> = None;
46 let mut current_table: Option<Table> = None;
47 let mut current_row: Option<TableRow> = None;
48 let mut current_cell: Option<TableCell> = None;
49 let mut in_text = false;
50 let mut in_table = false;
51
52 loop {
53 match reader.read_event_into(&mut buf) {
54 Ok(Event::Start(ref e)) => {
55 match e.name().as_ref() {
56 b"w:p" => {
57 // Start of a paragraph
58 if in_table {
59 // Paragraph inside a table cell
60 if current_cell.is_none() {
61 current_cell = Some(TableCell::default());
62 }
63 } else {
64 current_paragraph = Some(Paragraph::new());
65 }
66 }
67 b"w:r" => {
68 // Start of a run
69 current_run = Some(Run::default());
70 }
71 b"w:t" => {
72 // Text element
73 in_text = true;
74 }
75 b"w:tbl" => {
76 // Start of a table
77 in_table = true;
78 current_table = Some(Table::new());
79 }
80 b"w:tr" => {
81 // Table row
82 current_row = Some(TableRow::default());
83 }
84 b"w:tc" => {
85 // Table cell
86 current_cell = Some(TableCell::default());
87 }
88 b"w:b" => {
89 // Bold formatting
90 if let Some(ref mut run) = current_run {
91 run.bold = true;
92 }
93 }
94 b"w:i" => {
95 // Italic formatting
96 if let Some(ref mut run) = current_run {
97 run.italic = true;
98 }
99 }
100 b"w:u" => {
101 // Underline formatting
102 if let Some(ref mut run) = current_run {
103 run.underline = true;
104 }
105 }
106 _ => {}
107 }
108 }
109 Ok(Event::Text(e)) => {
110 if in_text {
111 if let Some(ref mut run) = current_run {
112 let text = e.unescape()?.into_owned();
113 run.text.push_str(&text);
114 }
115 }
116 }
117 Ok(Event::End(ref e)) => {
118 match e.name().as_ref() {
119 b"w:t" => {
120 in_text = false;
121 }
122 b"w:r" => {
123 // End of a run
124 if let Some(run) = current_run.take() {
125 if in_table {
126 // Add run to table cell paragraph
127 if let Some(ref mut cell) = current_cell {
128 if cell.paragraphs.is_empty() {
129 cell.paragraphs.push(Paragraph::new());
130 }
131 if let Some(para) = cell.paragraphs.last_mut() {
132 para.add_run(run);
133 }
134 }
135 } else if let Some(ref mut para) = current_paragraph {
136 para.add_run(run);
137 }
138 }
139 }
140 b"w:p" => {
141 // End of a paragraph
142 if in_table {
143 // Paragraph inside table cell already handled
144 } else if let Some(para) = current_paragraph.take() {
145 document.paragraphs.push(para);
146 }
147 }
148 b"w:tc" => {
149 // End of table cell
150 if let Some(cell) = current_cell.take() {
151 if let Some(ref mut row) = current_row {
152 row.cells.push(cell);
153 }
154 }
155 }
156 b"w:tr" => {
157 // End of table row
158 if let Some(row) = current_row.take() {
159 if let Some(ref mut table) = current_table {
160 table.rows.push(row);
161 }
162 }
163 }
164 b"w:tbl" => {
165 // End of table
166 in_table = false;
167 if let Some(table) = current_table.take() {
168 document.tables.push(table);
169 }
170 }
171 _ => {}
172 }
173 }
174 Ok(Event::Eof) => break,
175 Err(e) => return Err(e.into()),
176 _ => {}
177 }
178 buf.clear();
179 }
180
181 Ok(())
182 }
183}