1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
//! DocumentPart - the main document.xml part of a Word document.
use crate::ooxml::docx::paragraph::Paragraph;
use crate::ooxml::docx::table::Table;
use crate::ooxml::error::{OoxmlError, Result};
use crate::ooxml::opc::part::Part;
use quick_xml::events::Event;
use quick_xml::Reader;
use smallvec::SmallVec;
/// The main document part of a Word document.
///
/// This corresponds to the `/word/document.xml` part in the package.
/// It contains the main document content including paragraphs, tables,
/// sections, and other block-level elements.
///
/// The wrapper only borrows the underlying part for `'a`; it neither owns nor
/// copies the XML. The accessors defined on this type re-parse the part's
/// bytes each time they are called.
pub struct DocumentPart<'a> {
/// Reference to the underlying part
part: &'a dyn Part,
}
impl<'a> DocumentPart<'a> {
/// Create a DocumentPart from a Part.
///
/// # Arguments
///
/// * `part` - The part containing the document.xml content
pub fn from_part(part: &'a dyn Part) -> Result<Self> {
Ok(Self { part })
}
/// Get the XML bytes of the document.
#[inline]
pub fn xml_bytes(&self) -> &[u8] {
self.part.blob()
}
/// Extract all paragraph text from the document.
///
/// Concatenates, in document order, the character data of every element whose
/// local name is `t` (e.g. `<w:t>`). No separators are inserted between runs
/// or paragraphs.
///
/// Whitespace inside a text element is kept exactly as stored, so runs such
/// as `<w:t xml:space="preserve">Hello </w:t><w:t>world</w:t>` yield
/// `"Hello world"` rather than `"Helloworld"`.
///
/// NOTE: the bytes of each text event are appended as-is; entity references
/// are not resolved by this method.
///
/// # Errors
///
/// Returns `OoxmlError::Xml` if the XML is malformed or if the text content
/// is not valid UTF-8.
///
/// # Performance
///
/// Uses `quick-xml` for streaming XML parsing with a pre-allocated output
/// string and a reusable event buffer.
pub fn extract_text(&self) -> Result<String> {
    let xml = self.xml_bytes();
    let mut reader = Reader::from_reader(xml);
    // Do not trim: leading/trailing spaces inside <w:t> are significant.
    // Whitespace between elements is still ignored because only text that
    // occurs inside a `t` element is collected below.
    reader.config_mut().trim_text(false);

    // Rough estimate of the share of the part that is text content.
    let mut result = String::with_capacity(xml.len() / 8);
    let mut in_text_element = false;
    let mut buf = Vec::with_capacity(512); // Reusable event buffer

    loop {
        match reader.read_event_into(&mut buf) {
            // Only a real start tag opens a text element. A self-closing
            // `<w:t/>` is reported as `Event::Empty` and is never followed by
            // an `End` event, so it must not set the flag (it would stay set
            // and pull in unrelated text that follows).
            Ok(Event::Start(e)) => {
                if e.local_name().as_ref() == b"t" {
                    in_text_element = true;
                }
            }
            Ok(Event::Text(e)) if in_text_element => {
                // The document bytes are untrusted input: validate UTF-8
                // instead of assuming it.
                let text = std::str::from_utf8(e.as_ref())
                    .map_err(|err| OoxmlError::Xml(err.to_string()))?;
                result.push_str(text);
            }
            Ok(Event::End(e)) => {
                if e.local_name().as_ref() == b"t" {
                    in_text_element = false;
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => return Err(OoxmlError::Xml(e.to_string())),
            _ => {}
        }
        buf.clear();
    }

    Ok(result)
}
/// Count the paragraph elements in the document.
///
/// Every start tag or self-closing tag whose local name is `p` (e.g. `<w:p>`
/// or `<w:p/>`) contributes one to the total, wherever it appears in the XML.
///
/// # Errors
///
/// Returns `OoxmlError::Xml` if the XML cannot be parsed.
pub fn paragraph_count(&self) -> Result<usize> {
    let mut reader = Reader::from_reader(self.xml_bytes());
    reader.config_mut().trim_text(true);

    let mut scratch = Vec::new();
    let mut total = 0;

    loop {
        match reader
            .read_event_into(&mut scratch)
            .map_err(|err| OoxmlError::Xml(err.to_string()))?
        {
            Event::Eof => break,
            Event::Start(tag) | Event::Empty(tag) if tag.local_name().as_ref() == b"p" => {
                total += 1
            }
            _ => {}
        }
        scratch.clear();
    }

    Ok(total)
}
/// Count the table elements in the document.
///
/// Every start tag whose local name is `tbl` (e.g. `<w:tbl>`) contributes one
/// to the total, wherever it appears in the XML. Self-closing tags are not
/// counted.
///
/// # Errors
///
/// Returns `OoxmlError::Xml` if the XML cannot be parsed.
pub fn table_count(&self) -> Result<usize> {
    let mut reader = Reader::from_reader(self.xml_bytes());
    reader.config_mut().trim_text(true);

    let mut scratch = Vec::new();
    let mut total = 0;

    loop {
        match reader.read_event_into(&mut scratch) {
            Err(err) => return Err(OoxmlError::Xml(err.to_string())),
            Ok(Event::Eof) => return Ok(total),
            Ok(Event::Start(tag)) if tag.local_name().as_ref() == b"tbl" => total += 1,
            Ok(_) => {}
        }
        scratch.clear();
    }
}
/// Get all paragraphs in the document.
///
/// Extracts all `<w:p>` elements from the document body.
///
/// # Performance
///
/// Uses streaming XML parsing with pre-allocated SmallVec for efficient
/// storage of typically small paragraph collections.
pub fn paragraphs(&self) -> Result<SmallVec<[Paragraph; 32]>> {
let mut reader = Reader::from_reader(self.xml_bytes());
reader.config_mut().trim_text(true);
// Use SmallVec for efficient storage of paragraph collections
let mut paragraphs = SmallVec::new();
let mut current_para_xml = Vec::with_capacity(2048); // Pre-allocate for paragraph XML
let mut in_para = false;
let mut depth = 0;
let mut buf = Vec::with_capacity(512); // Reusable buffer
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(e)) => {
if e.local_name().as_ref() == b"p" && !in_para {
in_para = true;
depth = 1;
current_para_xml.clear();
current_para_xml.extend_from_slice(b"<w:p");
for attr in e.attributes().flatten() {
current_para_xml.push(b' ');
current_para_xml.extend_from_slice(attr.key.as_ref());
current_para_xml.extend_from_slice(b"=\"");
current_para_xml.extend_from_slice(&attr.value);
current_para_xml.push(b'"');
}
current_para_xml.push(b'>');
} else if in_para {
depth += 1;
current_para_xml.push(b'<');
current_para_xml.extend_from_slice(e.name().as_ref());
for attr in e.attributes().flatten() {
current_para_xml.push(b' ');
current_para_xml.extend_from_slice(attr.key.as_ref());
current_para_xml.extend_from_slice(b"=\"");
current_para_xml.extend_from_slice(&attr.value);
current_para_xml.push(b'"');
}
current_para_xml.push(b'>');
}
}
Ok(Event::End(e)) => {
if in_para {
current_para_xml.extend_from_slice(b"</");
current_para_xml.extend_from_slice(e.name().as_ref());
current_para_xml.push(b'>');
depth -= 1;
if depth == 0 && e.local_name().as_ref() == b"p" {
paragraphs.push(Paragraph::new(current_para_xml.clone()));
in_para = false;
}
}
}
Ok(Event::Text(e)) if in_para => {
current_para_xml.extend_from_slice(e.as_ref());
}
Ok(Event::Empty(e)) if in_para => {
current_para_xml.push(b'<');
current_para_xml.extend_from_slice(e.name().as_ref());
for attr in e.attributes().flatten() {
current_para_xml.push(b' ');
current_para_xml.extend_from_slice(attr.key.as_ref());
current_para_xml.extend_from_slice(b"=\"");
current_para_xml.extend_from_slice(&attr.value);
current_para_xml.push(b'"');
}
current_para_xml.extend_from_slice(b"/>");
}
Ok(Event::Eof) => break,
Err(e) => return Err(OoxmlError::Xml(e.to_string())),
_ => {}
}
buf.clear();
}
Ok(paragraphs)
}
/// Get all tables in the document.
///
/// Extracts all `<w:tbl>` elements from the document body.
///
/// # Performance
///
/// Uses SmallVec for efficient storage of typically small table collections.
pub fn tables(&self) -> Result<SmallVec<[Table; 8]>> {
let mut reader = Reader::from_reader(self.xml_bytes());
reader.config_mut().trim_text(true);
// Use SmallVec for efficient storage of table collections
let mut tables = SmallVec::new();
let mut current_table_xml = Vec::with_capacity(4096); // Pre-allocate for table XML
let mut in_table = false;
let mut depth = 0;
let mut buf = Vec::with_capacity(512); // Reusable buffer
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(e)) => {
if e.local_name().as_ref() == b"tbl" && !in_table {
in_table = true;
depth = 1;
current_table_xml.clear();
current_table_xml.extend_from_slice(b"<w:tbl");
for attr in e.attributes().flatten() {
current_table_xml.push(b' ');
current_table_xml.extend_from_slice(attr.key.as_ref());
current_table_xml.extend_from_slice(b"=\"");
current_table_xml.extend_from_slice(&attr.value);
current_table_xml.push(b'"');
}
current_table_xml.push(b'>');
} else if in_table {
depth += 1;
current_table_xml.push(b'<');
current_table_xml.extend_from_slice(e.name().as_ref());
for attr in e.attributes().flatten() {
current_table_xml.push(b' ');
current_table_xml.extend_from_slice(attr.key.as_ref());
current_table_xml.extend_from_slice(b"=\"");
current_table_xml.extend_from_slice(&attr.value);
current_table_xml.push(b'"');
}
current_table_xml.push(b'>');
}
}
Ok(Event::End(e)) => {
if in_table {
current_table_xml.extend_from_slice(b"</");
current_table_xml.extend_from_slice(e.name().as_ref());
current_table_xml.push(b'>');
depth -= 1;
if depth == 0 && e.local_name().as_ref() == b"tbl" {
tables.push(Table::new(current_table_xml.clone()));
in_table = false;
}
}
}
Ok(Event::Text(e)) if in_table => {
current_table_xml.extend_from_slice(e.as_ref());
}
Ok(Event::Empty(e)) if in_table => {
current_table_xml.push(b'<');
current_table_xml.extend_from_slice(e.name().as_ref());
for attr in e.attributes().flatten() {
current_table_xml.push(b' ');
current_table_xml.extend_from_slice(attr.key.as_ref());
current_table_xml.extend_from_slice(b"=\"");
current_table_xml.extend_from_slice(&attr.value);
current_table_xml.push(b'"');
}
current_table_xml.extend_from_slice(b"/>");
}
Ok(Event::Eof) => break,
Err(e) => return Err(OoxmlError::Xml(e.to_string())),
_ => {}
}
buf.clear();
}
Ok(tables)
}
}
#[cfg(test)]
mod tests {
// Placeholder: this module contains no tests yet.
// Tests will be added as we have a way to construct test XmlParts
}