docx_rs/reader/
mod.rs

1mod a_graphic;
2mod a_graphic_data;
3mod attributes;
4mod bookmark_end;
5mod bookmark_start;
6mod cell_margins;
7mod comment;
8mod comment_extended;
9mod comments;
10mod comments_extended;
11mod custom_properties;
12mod delete;
13mod div;
14mod doc_defaults;
15mod doc_grid;
16mod document;
17mod document_rels;
18mod drawing;
19mod errors;
20mod font_group;
21mod font_scheme;
22mod footer;
23mod frame_property;
24mod from_xml;
25mod header;
26mod header_or_footer_rels;
27mod hyperlink;
28mod ignore;
29mod insert;
30mod level;
31mod level_override;
32mod mc_fallback;
33mod numbering_property;
34mod numberings;
35mod page_num_type;
36mod paragraph;
37mod paragraph_property;
38mod paragraph_property_change;
39mod pic;
40mod positional_tab;
41mod read_zip;
42mod rels;
43mod run;
44mod run_property;
45mod section_property;
46mod settings;
47mod shading;
48mod shape;
49mod structured_data_tag;
50mod style;
51mod styles;
52mod tab;
53mod table;
54mod table_borders;
55mod table_cell;
56mod table_cell_borders;
57mod table_cell_margins;
58mod table_cell_property;
59mod table_position_property;
60mod table_property;
61mod table_row;
62mod tabs;
63mod text_box_content;
64mod theme;
65mod web_settings;
66mod wp_anchor;
67mod wps_shape;
68mod wps_text_box;
69mod xml_element;
70
71use std::{collections::HashMap, io::Cursor, path::PathBuf};
72
73use crate::documents::*;
74
75pub use attributes::*;
76pub use document_rels::*;
77pub use errors::ReaderError;
78pub use from_xml::*;
79pub use read_zip::*;
80pub use xml_element::*;
81use zip::ZipArchive;
82
83use self::header_or_footer_rels::{read_header_or_footer_rels, ReadHeaderOrFooterRels};
84
// Relationship type URIs used to locate package parts through `.rels` files.

// --- Office Open XML (2006): looked up in the package-level `_rels/.rels` ---

/// Relationship type of the main document part.
const DOC_RELATIONSHIP_TYPE: &str =
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
/// Relationship type of the custom document properties part.
const CUSTOM_PROPERTIES_TYPE: &str =
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties";

// --- Office Open XML (2006): looked up in the document relationships ---

/// Relationship type of the styles part.
const STYLE_RELATIONSHIP_TYPE: &str =
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles";
/// Relationship type of the numbering definitions part.
const NUMBERING_RELATIONSHIP_TYPE: &str =
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering";
/// Relationship type of the settings part.
const SETTINGS_TYPE: &str =
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings";
/// Relationship type of the web settings part.
const WEB_SETTINGS_TYPE: &str =
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings";
/// Relationship type of the comments part.
const COMMENTS_TYPE: &str =
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments";
/// Relationship type of a header part.
const HEADER_TYPE: &str =
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/header";
/// Relationship type of a footer part.
const FOOTER_TYPE: &str =
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer";
/// Relationship type of a theme part.
const THEME_TYPE: &str =
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme";
/// Relationship type of an image; also looked up in header and footer relationships.
const IMAGE_TYPE: &str =
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
/// Relationship type of a hyperlink target.
const HYPERLINK_TYPE: &str =
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink";

// --- Microsoft Office (2011) extension ---

/// Relationship type of the extended comments part.
const COMMENTS_EXTENDED_TYPE: &str =
    "http://schemas.microsoft.com/office/2011/relationships/commentsExtended";
113
114fn read_headers(
115    rels: &ReadDocumentRels,
116    archive: &mut ZipArchive<Cursor<&[u8]>>,
117) -> HashMap<RId, (Header, ReadHeaderOrFooterRels)> {
118    let header_paths = rels.find_target_path(HEADER_TYPE);
119    let headers: HashMap<RId, (Header, ReadHeaderOrFooterRels)> = header_paths
120        .unwrap_or_default()
121        .into_iter()
122        .filter_map(|(rid, path, ..)| {
123            let data = read_zip(archive, path.to_str().expect("should have header path."));
124            if let Ok(d) = data {
125                if let Ok(h) = Header::from_xml(&d[..]) {
126                    let rels = read_header_or_footer_rels(archive, path).unwrap_or_default();
127                    return Some((rid, (h, rels)));
128                }
129            }
130            None
131        })
132        .collect();
133    headers
134}
135
136fn read_footers(
137    rels: &ReadDocumentRels,
138    archive: &mut ZipArchive<Cursor<&[u8]>>,
139) -> HashMap<RId, (Footer, ReadHeaderOrFooterRels)> {
140    let footer_paths = rels.find_target_path(FOOTER_TYPE);
141    let footers: HashMap<RId, (Footer, ReadHeaderOrFooterRels)> = footer_paths
142        .unwrap_or_default()
143        .into_iter()
144        .filter_map(|(rid, path, ..)| {
145            let data = read_zip(archive, path.to_str().expect("should have footer path."));
146            if let Ok(d) = data {
147                if let Ok(h) = Footer::from_xml(&d[..]) {
148                    let rels = read_header_or_footer_rels(archive, path).unwrap_or_default();
149                    return Some((rid, (h, rels)));
150                }
151            }
152            None
153        })
154        .collect();
155    footers
156}
157
158fn read_themes(rels: &ReadDocumentRels, archive: &mut ZipArchive<Cursor<&[u8]>>) -> Vec<Theme> {
159    let theme_paths = rels.find_target_path(THEME_TYPE);
160    theme_paths
161        .unwrap_or_default()
162        .into_iter()
163        .filter_map(|(_rid, path, ..)| {
164            let data = read_zip(archive, path.to_str().expect("should have footer path."));
165            if let Ok(d) = data {
166                if let Ok(h) = Theme::from_xml(&d[..]) {
167                    return Some(h);
168                }
169            }
170            None
171        })
172        .collect()
173}
174
175pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
176    let mut docx = Docx::new();
177    let cur = Cursor::new(buf);
178    let mut archive = zip::ZipArchive::new(cur)?;
179    // First, the content type for relationship parts and the Main Document part
180    // (the only required part) must be defined (physically located at /[Content_Types].xml in the package)
181    let _content_types = {
182        let data = read_zip(&mut archive, "[Content_Types].xml")?;
183        ContentTypes::from_xml(&data[..])?
184    };
185
186    // Next, the single required relationship (the package-level relationship to the Main Document part)
187    //  must be defined (physically located at /_rels/.rels in the package)
188    let rels = {
189        let data = read_zip(&mut archive, "_rels/.rels")?;
190        Rels::from_xml(&data[..])?
191    };
192
193    // Finally, the minimum content for the Main Document part must be defined
194    // (physically located at /document.xml in the package):
195    let main_rel = rels
196        .find_target(DOC_RELATIONSHIP_TYPE)
197        .ok_or(ReaderError::DocumentNotFoundError);
198
199    let document_path = if let Ok(rel) = main_rel {
200        rel.2.clone()
201    } else {
202        "word/document.xml".to_owned()
203    };
204
205    if let Some(custom_props) = rels.find_target(CUSTOM_PROPERTIES_TYPE) {
206        let data = read_zip(&mut archive, &custom_props.2);
207        if let Ok(data) = data {
208            if let Ok(custom) = CustomProps::from_xml(&data[..]) {
209                docx.doc_props.custom = custom;
210            }
211        }
212    }
213
214    let rels = read_document_rels(&mut archive, &document_path)?;
215
216    let headers = read_headers(&rels, &mut archive);
217    let footers = read_footers(&rels, &mut archive);
218
219    docx.themes = read_themes(&rels, &mut archive);
220
221    // Read commentsExtended
222    let comments_extended_path = rels.find_target_path(COMMENTS_EXTENDED_TYPE);
223    let comments_extended = if let Some(comments_extended_path) = comments_extended_path {
224        if let Some((_, comments_extended_path, ..)) = comments_extended_path.first() {
225            let data = read_zip(
226                &mut archive,
227                comments_extended_path
228                    .to_str()
229                    .expect("should have comments extended."),
230            );
231            if let Ok(data) = data {
232                CommentsExtended::from_xml(&data[..])?
233            } else {
234                CommentsExtended::default()
235            }
236        } else {
237            CommentsExtended::default()
238        }
239    } else {
240        CommentsExtended::default()
241    };
242
243    // Read comments
244    let comments_path = rels.find_target_path(COMMENTS_TYPE);
245    let comments = if let Some(paths) = comments_path {
246        if let Some((_, comments_path, ..)) = paths.first() {
247            let data = read_zip(
248                &mut archive,
249                comments_path.to_str().expect("should have comments."),
250            );
251            if let Ok(data) = data {
252                let mut comments = Comments::from_xml(&data[..])?.into_inner();
253                for i in 0..comments.len() {
254                    let c = &comments[i];
255                    let extended = comments_extended.children.iter().find(|ex| {
256                        for child in &c.children {
257                            if let CommentChild::Paragraph(p) = child {
258                                if ex.paragraph_id == p.id {
259                                    return true;
260                                }
261                            }
262                        }
263                        false
264                    });
265                    if let Some(CommentExtended {
266                        parent_paragraph_id: Some(parent_paragraph_id),
267                        ..
268                    }) = extended
269                    {
270                        if let Some(parent_comment) = comments.iter().find(|c| {
271                            for child in &c.children {
272                                if let CommentChild::Paragraph(p) = child {
273                                    if &p.id == parent_paragraph_id {
274                                        return true;
275                                    }
276                                }
277                            }
278                            false
279                        }) {
280                            comments[i].parent_comment_id = Some(parent_comment.id);
281                        }
282                    }
283                }
284                Comments { comments }
285            } else {
286                Comments::default()
287            }
288        } else {
289            Comments::default()
290        }
291    } else {
292        Comments::default()
293    };
294
295    let document = {
296        let data = read_zip(&mut archive, &document_path)?;
297        Document::from_xml(&data[..])?
298    };
299    docx = docx.document(document);
300
301    // assign headers
302    if let Some(h) = docx.document.section_property.header_reference.clone() {
303        if let Some((header, rels)) = headers.get(&h.id) {
304            docx.document = docx.document.header(header.clone(), &h.id);
305            let count = docx.document_rels.header_count + 1;
306            docx.document_rels.header_count = count;
307            docx.content_type = docx.content_type.add_header();
308            // Read media
309            let media = rels.find_target_path(IMAGE_TYPE);
310            docx = add_images(docx, media, &mut archive);
311        }
312    }
313    if let Some(ref h) = docx
314        .document
315        .section_property
316        .first_header_reference
317        .clone()
318    {
319        if let Some((header, rels)) = headers.get(&h.id) {
320            docx.document = docx
321                .document
322                .first_header_without_title_pg(header.clone(), &h.id);
323            let count = docx.document_rels.header_count + 1;
324            docx.document_rels.header_count = count;
325            docx.content_type = docx.content_type.add_header();
326            // Read media
327            let media = rels.find_target_path(IMAGE_TYPE);
328            docx = add_images(docx, media, &mut archive);
329        }
330    }
331    if let Some(ref h) = docx.document.section_property.even_header_reference.clone() {
332        if let Some((header, rels)) = headers.get(&h.id) {
333            docx.document = docx.document.even_header(header.clone(), &h.id);
334            let count = docx.document_rels.header_count + 1;
335            docx.document_rels.header_count = count;
336            docx.content_type = docx.content_type.add_header();
337
338            // Read media
339            let media = rels.find_target_path(IMAGE_TYPE);
340            docx = add_images(docx, media, &mut archive);
341        }
342    }
343
344    // assign footers
345    if let Some(f) = docx.document.section_property.footer_reference.clone() {
346        if let Some((footer, rels)) = footers.get(&f.id) {
347            docx.document = docx.document.footer(footer.clone(), &f.id);
348            let count = docx.document_rels.footer_count + 1;
349            docx.document_rels.footer_count = count;
350            docx.content_type = docx.content_type.add_footer();
351
352            // Read media
353            let media = rels.find_target_path(IMAGE_TYPE);
354            docx = add_images(docx, media, &mut archive);
355        }
356    }
357
358    if let Some(ref f) = docx
359        .document
360        .section_property
361        .first_footer_reference
362        .clone()
363    {
364        if let Some((footer, rels)) = footers.get(&f.id) {
365            docx.document = docx
366                .document
367                .first_footer_without_title_pg(footer.clone(), &f.id);
368            let count = docx.document_rels.footer_count + 1;
369            docx.document_rels.footer_count = count;
370            docx.content_type = docx.content_type.add_footer();
371
372            // Read media
373            let media = rels.find_target_path(IMAGE_TYPE);
374            docx = add_images(docx, media, &mut archive);
375        }
376    }
377    if let Some(ref f) = docx.document.section_property.even_footer_reference.clone() {
378        if let Some((footer, rels)) = footers.get(&f.id) {
379            docx.document = docx.document.even_footer(footer.clone(), &f.id);
380            let count = docx.document_rels.footer_count + 1;
381            docx.document_rels.footer_count = count;
382            docx.content_type = docx.content_type.add_footer();
383
384            // Read media
385            let media = rels.find_target_path(IMAGE_TYPE);
386            docx = add_images(docx, media, &mut archive);
387        }
388    }
389
390    // store comments to paragraphs.
391    if !comments.inner().is_empty() {
392        docx.store_comments(comments.inner());
393        docx = docx.comments(comments);
394        docx = docx.comments_extended(comments_extended);
395    }
396
397    // Read document relationships
398    // Read styles
399    let style_path = rels.find_target_path(STYLE_RELATIONSHIP_TYPE);
400    if let Some(paths) = style_path {
401        if let Some((_, style_path, ..)) = paths.first() {
402            let data = read_zip(
403                &mut archive,
404                style_path.to_str().expect("should have styles"),
405            )?;
406            let styles = Styles::from_xml(&data[..])?;
407            docx = docx.styles(styles);
408        }
409    }
410
411    // Read numberings
412    let num_path = rels.find_target_path(NUMBERING_RELATIONSHIP_TYPE);
413    if let Some(paths) = num_path {
414        if let Some((_, num_path, ..)) = paths.first() {
415            let data = read_zip(
416                &mut archive,
417                num_path.to_str().expect("should have numberings"),
418            )?;
419            let nums = Numberings::from_xml(&data[..])?;
420            docx = docx.numberings(nums);
421        }
422    }
423
424    // Read settings
425    let settings_path = rels.find_target_path(SETTINGS_TYPE);
426    if let Some(paths) = settings_path {
427        if let Some((_, settings_path, ..)) = paths.first() {
428            let data = read_zip(
429                &mut archive,
430                settings_path.to_str().expect("should have settings"),
431            )?;
432            let settings = Settings::from_xml(&data[..])?;
433            docx = docx.settings(settings);
434        }
435    }
436
437    // Read web settings
438    let web_settings_path = rels.find_target_path(WEB_SETTINGS_TYPE);
439    if let Some(paths) = web_settings_path {
440        if let Some((_, web_settings_path, ..)) = paths.first() {
441            let data = read_zip(
442                &mut archive,
443                web_settings_path
444                    .to_str()
445                    .expect("should have web settings"),
446            )?;
447            let web_settings = WebSettings::from_xml(&data[..])?;
448            docx = docx.web_settings(web_settings);
449        }
450    }
451    // Read media
452    let media = rels.find_target_path(IMAGE_TYPE);
453    docx = add_images(docx, media, &mut archive);
454
455    // Read hyperlinks
456    let links = rels.find_target_path(HYPERLINK_TYPE);
457    if let Some(paths) = links {
458        for (id, target, mode) in paths {
459            if let Some(mode) = mode {
460                docx =
461                    docx.add_hyperlink(id, target.to_str().expect("should convert to str"), mode);
462            }
463        }
464    }
465
466    Ok(docx)
467}
468
469fn add_images(
470    mut docx: Docx,
471    media: Option<Vec<(RId, PathBuf, Option<String>)>>,
472    archive: &mut ZipArchive<Cursor<&[u8]>>,
473) -> Docx {
474    // Read media
475    if let Some(paths) = media {
476        for (id, media, ..) in paths {
477            if let Ok(data) = read_zip(archive, media.to_str().expect("should have media")) {
478                docx = docx.add_image(id, media.to_str().unwrap().to_string(), data);
479            }
480        }
481    }
482    docx
483}
docx_rs/reader/mod.rs

docx_rs/reader/
mod.rs