linch_docx_rs/document/
mod.rs

1//! Document model - high-level API for DOCX documents
2
3mod body;
4mod numbering;
5mod paragraph;
6mod run;
7mod table;
8
9pub use body::{BlockContent, Body};
10pub use numbering::{AbstractNum, Level, LevelOverride, Num, NumberFormat, Numbering};
11pub use paragraph::{Hyperlink, Paragraph, ParagraphContent, ParagraphProperties};
12pub use run::{BreakType, Run, RunContent, RunProperties};
13pub use table::{GridColumn, Table, TableCell, TableCellProperties, TableRow, VMerge};
14
15use crate::error::{Error, Result};
16use crate::opc::{Package, Part, PartUri};
17use crate::xml;
18use quick_xml::events::{BytesDecl, BytesEnd, BytesStart, Event};
19use quick_xml::{Reader, Writer};
20use std::io::{BufRead, Cursor};
21use std::path::Path;
22
23/// A DOCX document
24#[derive(Debug)]
25pub struct Document {
26    /// Underlying OPC package
27    package: Package,
28    /// Parsed document body
29    body: Body,
30    /// Numbering definitions (from numbering.xml)
31    numbering: Option<Numbering>,
32}
33
34impl Document {
35    /// Open a document from a file path
36    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
37        let package = Package::open(path)?;
38        Self::from_package(package)
39    }
40
41    /// Open a document from bytes
42    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
43        let package = Package::from_bytes(bytes)?;
44        Self::from_package(package)
45    }
46
47    /// Create document from an OPC package
48    fn from_package(package: Package) -> Result<Self> {
49        // Get main document part
50        let doc_part = package
51            .main_document_part()
52            .ok_or_else(|| Error::MissingPart("Main document part not found".into()))?;
53
54        // Parse document.xml
55        let xml = doc_part.data_as_str()?;
56        let body = parse_document_xml(xml)?;
57
58        // Try to load numbering.xml
59        let numbering = Self::load_numbering(&package);
60
61        Ok(Self {
62            package,
63            body,
64            numbering,
65        })
66    }
67
68    /// Load numbering definitions from the package
69    fn load_numbering(package: &Package) -> Option<Numbering> {
70        // First find the numbering part through relationships
71        let doc_part = package.main_document_part()?;
72        let rels = doc_part.relationships()?;
73        let numbering_rel = rels.by_type(crate::opc::rel_types::NUMBERING)?;
74
75        // Resolve the target URI
76        let target = &numbering_rel.target;
77        let numbering_uri = if target.starts_with('/') {
78            PartUri::new(target).ok()?
79        } else {
80            PartUri::new(&format!("/word/{}", target)).ok()?
81        };
82
83        // Get the numbering part
84        let numbering_part = package.part(&numbering_uri)?;
85        let xml = numbering_part.data_as_str().ok()?;
86
87        // Parse numbering.xml
88        Numbering::from_xml(xml).ok()
89    }
90
91    /// Create a new empty document
92    pub fn new() -> Self {
93        Self {
94            package: Package::new(),
95            body: Body::default(),
96            numbering: None,
97        }
98    }
99
100    /// Save the document to a file
101    pub fn save<P: AsRef<Path>>(&mut self, path: P) -> Result<()> {
102        self.update_package()?;
103        self.package.save(path)
104    }
105
106    /// Save the document to bytes
107    pub fn to_bytes(&mut self) -> Result<Vec<u8>> {
108        self.update_package()?;
109        self.package.to_bytes()
110    }
111
112    /// Update the package with current body content
113    fn update_package(&mut self) -> Result<()> {
114        let xml = serialize_document_xml(&self.body)?;
115        let uri = PartUri::new("/word/document.xml")?;
116
117        // Update or add the document part
118        let part = Part::new(
119            uri.clone(),
120            crate::opc::MAIN_DOCUMENT.to_string(),
121            xml.into_bytes(),
122        );
123        self.package.add_part(part);
124
125        // Ensure the relationship exists for the main document
126        if self.package.main_document_part().is_none() {
127            use crate::opc::rel_types;
128            self.package
129                .add_relationship(rel_types::OFFICE_DOCUMENT, uri.as_str());
130        }
131
132        // Update numbering.xml if present
133        if let Some(ref numbering) = self.numbering {
134            let numbering_xml = numbering.to_xml()?;
135            let numbering_uri = PartUri::new("/word/numbering.xml")?;
136            let numbering_part = Part::new(
137                numbering_uri,
138                crate::opc::NUMBERING.to_string(),
139                numbering_xml.into_bytes(),
140            );
141            self.package.add_part(numbering_part);
142        }
143
144        Ok(())
145    }
146
147    /// Get all paragraphs
148    pub fn paragraphs(&self) -> impl Iterator<Item = &Paragraph> {
149        self.body.paragraphs()
150    }
151
152    /// Get paragraph count
153    pub fn paragraph_count(&self) -> usize {
154        self.body
155            .content
156            .iter()
157            .filter(|c| matches!(c, BlockContent::Paragraph(_)))
158            .count()
159    }
160
161    /// Get paragraph by index
162    pub fn paragraph(&self, index: usize) -> Option<&Paragraph> {
163        self.body.paragraphs().nth(index)
164    }
165
166    /// Get all tables
167    pub fn tables(&self) -> impl Iterator<Item = &Table> {
168        self.body.tables()
169    }
170
171    /// Get table count
172    pub fn table_count(&self) -> usize {
173        self.body
174            .content
175            .iter()
176            .filter(|c| matches!(c, BlockContent::Table(_)))
177            .count()
178    }
179
180    /// Get table by index
181    pub fn table(&self, index: usize) -> Option<&Table> {
182        self.body.tables().nth(index)
183    }
184
185    /// Get all text in the document
186    pub fn text(&self) -> String {
187        self.body
188            .paragraphs()
189            .map(|p| p.text())
190            .collect::<Vec<_>>()
191            .join("\n")
192    }
193
194    /// Get the underlying package
195    pub fn package(&self) -> &Package {
196        &self.package
197    }
198
199    /// Get mutable body
200    pub fn body_mut(&mut self) -> &mut Body {
201        &mut self.body
202    }
203
204    /// Add a paragraph with text
205    pub fn add_paragraph(&mut self, text: impl Into<String>) -> &mut Paragraph {
206        let para = Paragraph::new(text);
207        self.body.add_paragraph(para);
208        // Return mutable reference to the last paragraph
209        self.body
210            .content
211            .iter_mut()
212            .rev()
213            .find_map(|c| {
214                if let BlockContent::Paragraph(p) = c {
215                    Some(p)
216                } else {
217                    None
218                }
219            })
220            .expect("Just added paragraph")
221    }
222
223    /// Add an empty paragraph
224    pub fn add_empty_paragraph(&mut self) -> &mut Paragraph {
225        self.body.add_paragraph(Paragraph::default());
226        self.body
227            .content
228            .iter_mut()
229            .rev()
230            .find_map(|c| {
231                if let BlockContent::Paragraph(p) = c {
232                    Some(p)
233                } else {
234                    None
235                }
236            })
237            .expect("Just added paragraph")
238    }
239
240    /// Get numbering definitions
241    pub fn numbering(&self) -> Option<&Numbering> {
242        self.numbering.as_ref()
243    }
244
245    /// Get mutable numbering definitions
246    pub fn numbering_mut(&mut self) -> Option<&mut Numbering> {
247        self.numbering.as_mut()
248    }
249
250    /// Check if a paragraph is a list item
251    pub fn is_list_item(&self, para: &Paragraph) -> bool {
252        para.properties.as_ref().and_then(|p| p.num_id).is_some()
253    }
254
255    /// Check if a paragraph is a bullet list item
256    pub fn is_bullet_list_item(&self, para: &Paragraph) -> bool {
257        if let Some(num_id) = para.properties.as_ref().and_then(|p| p.num_id) {
258            if let Some(ref numbering) = self.numbering {
259                return numbering.is_bullet_list(num_id);
260            }
261        }
262        false
263    }
264
265    /// Get the list level of a paragraph (0-8), or None if not a list item
266    pub fn list_level(&self, para: &Paragraph) -> Option<u32> {
267        para.properties.as_ref().and_then(|p| {
268            if p.num_id.is_some() {
269                Some(p.num_level.unwrap_or(0))
270            } else {
271                None
272            }
273        })
274    }
275
276    /// Get the number format for a list item
277    pub fn list_format(&self, para: &Paragraph) -> Option<&NumberFormat> {
278        let props = para.properties.as_ref()?;
279        let num_id = props.num_id?;
280        let level = props.num_level.unwrap_or(0) as u8;
281        self.numbering.as_ref()?.get_format(num_id, level)
282    }
283
284    /// Add a table to the document
285    pub fn add_table(&mut self, table: Table) -> &mut Table {
286        self.body.add_table(table);
287        // Return mutable reference to the last table
288        self.body
289            .content
290            .iter_mut()
291            .rev()
292            .find_map(|c| {
293                if let BlockContent::Table(t) = c {
294                    Some(t)
295                } else {
296                    None
297                }
298            })
299            .expect("Just added table")
300    }
301
302    /// Create and add a table with specified rows and columns
303    pub fn add_table_with_size(&mut self, rows: usize, cols: usize) -> &mut Table {
304        self.add_table(Table::new(rows, cols))
305    }
306
307    /// Get mutable table by index
308    pub fn table_mut(&mut self, index: usize) -> Option<&mut Table> {
309        self.body
310            .content
311            .iter_mut()
312            .filter_map(|c| {
313                if let BlockContent::Table(t) = c {
314                    Some(t)
315                } else {
316                    None
317                }
318            })
319            .nth(index)
320    }
321}
322
323impl Default for Document {
324    fn default() -> Self {
325        Self::new()
326    }
327}
328
329/// Parse document.xml content
330fn parse_document_xml(xml: &str) -> Result<Body> {
331    let mut reader = Reader::from_str(xml);
332    reader.config_mut().trim_text(true);
333
334    let mut buf = Vec::new();
335    let mut body = None;
336
337    loop {
338        match reader.read_event_into(&mut buf)? {
339            Event::Start(e) => {
340                let name = e.name();
341                let local = name.local_name();
342
343                match local.as_ref() {
344                    b"body" => {
345                        body = Some(Body::from_reader(&mut reader)?);
346                    }
347                    b"document" => {
348                        // Continue to find body
349                    }
350                    _ => {
351                        // Skip unknown elements at document level
352                        skip_element(&mut reader, &e)?;
353                    }
354                }
355            }
356            Event::Eof => break,
357            _ => {}
358        }
359        buf.clear();
360    }
361
362    body.ok_or_else(|| Error::InvalidDocument("Missing w:body element".into()))
363}
364
365/// Serialize body to document.xml content
366fn serialize_document_xml(body: &Body) -> Result<String> {
367    let mut buffer = Cursor::new(Vec::new());
368    let mut writer = Writer::new(&mut buffer);
369
370    // XML declaration
371    writer.write_event(Event::Decl(BytesDecl::new(
372        "1.0",
373        Some("UTF-8"),
374        Some("yes"),
375    )))?;
376
377    // Document element with namespaces
378    let mut doc_start = BytesStart::new("w:document");
379    for (attr, value) in xml::document_namespaces() {
380        doc_start.push_attribute((attr, value));
381    }
382    writer.write_event(Event::Start(doc_start))?;
383
384    // Body
385    body.write_to(&mut writer)?;
386
387    // Close document
388    writer.write_event(Event::End(BytesEnd::new("w:document")))?;
389
390    let xml_bytes = buffer.into_inner();
391    String::from_utf8(xml_bytes).map_err(|e| Error::InvalidDocument(e.to_string()))
392}
393
394/// Skip an element and all its children
395fn skip_element<R: BufRead>(
396    reader: &mut Reader<R>,
397    start: &quick_xml::events::BytesStart,
398) -> Result<()> {
399    let target = start.name().as_ref().to_vec();
400    let mut depth = 1;
401    let mut buf = Vec::new();
402
403    loop {
404        match reader.read_event_into(&mut buf)? {
405            Event::Start(e) if e.name().as_ref() == target => depth += 1,
406            Event::End(e) if e.name().as_ref() == target => {
407                depth -= 1;
408                if depth == 0 {
409                    break;
410                }
411            }
412            Event::Eof => break,
413            _ => {}
414        }
415        buf.clear();
416    }
417
418    Ok(())
419}
420
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture with two paragraphs: plain text, then a bold run styled `Heading1`.
    const SIMPLE_DOC: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
  <w:body>
    <w:p>
      <w:r>
        <w:t>Hello, World!</w:t>
      </w:r>
    </w:p>
    <w:p>
      <w:pPr>
        <w:pStyle w:val="Heading1"/>
      </w:pPr>
      <w:r>
        <w:rPr>
          <w:b/>
        </w:rPr>
        <w:t>This is a heading</w:t>
      </w:r>
    </w:p>
  </w:body>
</w:document>"#;

    /// Text, paragraph style and the bold flag are read from the simple fixture.
    #[test]
    fn test_parse_simple_document() {
        let parsed = parse_document_xml(SIMPLE_DOC).unwrap();

        // The fixture holds exactly two paragraphs.
        let paragraphs: Vec<_> = parsed.paragraphs().collect();
        assert_eq!(paragraphs.len(), 2);
        let (first, second) = (paragraphs[0], paragraphs[1]);

        // Plain paragraph.
        assert_eq!(first.text(), "Hello, World!");

        // Heading paragraph.
        assert_eq!(second.text(), "This is a heading");
        assert_eq!(second.style(), Some("Heading1"));

        // The heading consists of a single bold run.
        let heading_runs: Vec<_> = second.runs().collect();
        assert_eq!(heading_runs.len(), 1);
        assert!(heading_runs[0].bold());
    }

    /// Bold, italic, font size and color are read from the run properties.
    #[test]
    fn test_parse_with_formatting() {
        let source = r#"<?xml version="1.0"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
  <w:body>
    <w:p>
      <w:r>
        <w:rPr>
          <w:b/>
          <w:i/>
          <w:sz w:val="28"/>
          <w:color w:val="FF0000"/>
        </w:rPr>
        <w:t>Formatted text</w:t>
      </w:r>
    </w:p>
  </w:body>
</w:document>"#;

        let parsed = parse_document_xml(source).unwrap();
        let first_paragraph = parsed.paragraphs().next().unwrap();
        let first_run = first_paragraph.runs().next().unwrap();

        assert!(first_run.bold());
        assert!(first_run.italic());
        // `w:sz` is given in half-points: 28 half-points are 14pt.
        assert_eq!(first_run.font_size_pt(), Some(14.0));
        assert_eq!(first_run.color(), Some("FF0000"));
    }
}