docx_reader/reader/
mod.rs

1mod a_graphic;
2mod a_graphic_data;
3mod attributes;
4mod custom_properties;
5mod delete;
6mod div;
7mod doc_defaults;
8mod doc_grid;
9mod document;
10mod document_rels;
11mod drawing;
12mod errors;
13mod font_group;
14mod font_scheme;
15mod footer;
16mod from_xml;
17mod header;
18mod hyperlink;
19mod ignore;
20mod insert;
21mod level;
22mod level_override;
23mod mc_fallback;
24mod numbering_property;
25mod numberings;
26mod paragraph;
27mod paragraph_property;
28mod paragraph_property_change;
29mod pic;
30mod read_zip;
31mod rels;
32mod run;
33mod run_property;
34mod section_property;
35mod settings;
36mod shading;
37mod shape;
38mod structured_data_tag;
39mod style;
40mod styles;
41mod tab;
42mod table;
43mod table_borders;
44mod table_cell;
45mod table_cell_borders;
46mod table_cell_margins;
47mod table_cell_property;
48mod table_property;
49mod table_row;
50mod tabs;
51mod text_box_content;
52mod theme;
53mod web_settings;
54mod wp_anchor;
55mod wps_shape;
56mod wps_text_box;
57mod xml_element;
58
59use std::{collections::HashMap, io::Cursor};
60use zip::ZipArchive;
61
62use crate::documents::*;
63
64pub use attributes::*;
65pub use document_rels::*;
66pub use errors::ReaderError;
67pub use from_xml::*;
68pub use mc_fallback::*;
69pub use read_zip::*;
70pub use xml_element::*;
71
// Relationship type URIs from the 2006 `officeDocument` relationships
// namespace. Every constant below is the shared namespace prefix followed by
// the name of the part kind it identifies.
macro_rules! relationship_type {
	($part:literal) => {
		concat!(
			"http://schemas.openxmlformats.org/officeDocument/2006/relationships/",
			$part
		)
	};
}

/// Relationship type of the main document part.
const DOC_RELATIONSHIP_TYPE: &str = relationship_type!("officeDocument");
/// Relationship type of the custom properties part.
const CUSTOM_PROPERTIES_TYPE: &str = relationship_type!("custom-properties");
/// Relationship type of the styles part.
const STYLE_RELATIONSHIP_TYPE: &str = relationship_type!("styles");
/// Relationship type of the numbering part.
const NUMBERING_RELATIONSHIP_TYPE: &str = relationship_type!("numbering");
/// Relationship type of the settings part.
const SETTINGS_TYPE: &str = relationship_type!("settings");
/// Relationship type of the web settings part.
const WEB_SETTINGS_TYPE: &str = relationship_type!("webSettings");
/// Relationship type of a header part.
const HEADER_TYPE: &str = relationship_type!("header");
/// Relationship type of a footer part.
const FOOTER_TYPE: &str = relationship_type!("footer");
/// Relationship type of a theme part.
const THEME_TYPE: &str = relationship_type!("theme");
/// Relationship type of an image part.
const IMAGE_TYPE: &str = relationship_type!("image");
/// Relationship type of a hyperlink target.
const HYPERLINK_TYPE: &str = relationship_type!("hyperlink");
95
96fn read_headers(
97	rels: &ReadDocumentRels,
98	archive: &mut ZipArchive<Cursor<&[u8]>>,
99) -> HashMap<RId, Header> {
100	let header_paths = rels.find_target_path(HEADER_TYPE);
101	let headers: HashMap<RId, Header> = header_paths
102		.unwrap_or_default()
103		.into_iter()
104		.filter_map(|(rid, path, ..)| {
105			let data = read_zip(archive, path.to_str().expect("should have header path."));
106			if let Ok(d) = data {
107				if let Ok(h) = Header::from_xml(&d[..]) {
108					return Some((rid, h));
109				}
110			}
111			None
112		})
113		.collect();
114	headers
115}
116
117fn read_footers(
118	rels: &ReadDocumentRels,
119	archive: &mut ZipArchive<Cursor<&[u8]>>,
120) -> HashMap<RId, Footer> {
121	let footer_paths = rels.find_target_path(FOOTER_TYPE);
122	let footers: HashMap<RId, Footer> = footer_paths
123		.unwrap_or_default()
124		.into_iter()
125		.filter_map(|(rid, path, ..)| {
126			let data = read_zip(archive, path.to_str().expect("should have footer path."));
127			if let Ok(d) = data {
128				if let Ok(h) = Footer::from_xml(&d[..]) {
129					return Some((rid, h));
130				}
131			}
132			None
133		})
134		.collect();
135	footers
136}
137
138fn read_themes(rels: &ReadDocumentRels, archive: &mut ZipArchive<Cursor<&[u8]>>) -> Vec<Theme> {
139	let theme_paths = rels.find_target_path(THEME_TYPE);
140	theme_paths
141		.unwrap_or_default()
142		.into_iter()
143		.filter_map(|(_rid, path, ..)| {
144			let data = read_zip(archive, path.to_str().expect("should have footer path."));
145			if let Ok(d) = data {
146				if let Ok(h) = Theme::from_xml(&d[..]) {
147					return Some(h);
148				}
149			}
150			None
151		})
152		.collect()
153}
154
155pub fn read_docx(buf: &[u8]) -> Result<Docx, ReaderError> {
156	let mut docx = Docx::new();
157	let cur = Cursor::new(buf);
158	let mut archive = zip::ZipArchive::new(cur)?;
159	// First, the content type for relationship parts and the Main Document part
160	// (the only required part) must be defined (physically located at /[Content_Types].xml in the package)
161	let _content_types = {
162		let data = read_zip(&mut archive, "[Content_Types].xml")?;
163		ContentTypes::from_xml(&data[..])?
164	};
165
166	// Next, the single required relationship (the package-level relationship to the Main Document part)
167	//  must be defined (physically located at /_rels/.rels in the package)
168	let rels = {
169		let data = read_zip(&mut archive, "_rels/.rels")?;
170		Rels::from_xml(&data[..])?
171	};
172
173	// Finally, the minimum content for the Main Document part must be defined
174	// (physically located at /document.xml in the package):
175	let main_rel = rels
176		.find_target(DOC_RELATIONSHIP_TYPE)
177		.ok_or(ReaderError::DocumentNotFoundError);
178
179	let document_path = if let Ok(rel) = main_rel {
180		rel.2.clone()
181	} else {
182		"word/document.xml".to_owned()
183	};
184
185	if let Some(custom_props) = rels.find_target(CUSTOM_PROPERTIES_TYPE) {
186		let data = read_zip(&mut archive, &custom_props.2);
187		if let Ok(data) = data {
188			if let Ok(custom) = CustomProps::from_xml(&data[..]) {
189				docx.doc_props.custom = custom;
190			}
191		}
192	}
193
194	let rels = read_document_rels(&mut archive, &document_path)?;
195
196	let headers = read_headers(&rels, &mut archive);
197	let footers = read_footers(&rels, &mut archive);
198
199	docx.themes = read_themes(&rels, &mut archive);
200
201	// assign headers
202	if let Some(h) = docx.document.section_property.header_reference.clone() {
203		if let Some(header) = headers.get(&h.id) {
204			docx.document = docx.document.header(header.clone(), &h.id);
205			let count = docx.document_rels.header_count + 1;
206			docx.document_rels.header_count = count;
207			docx.content_type = docx.content_type.add_header();
208		}
209	}
210	if let Some(ref h) = docx
211		.document
212		.section_property
213		.first_header_reference
214		.clone()
215	{
216		if let Some(header) = headers.get(&h.id) {
217			docx.document = docx.document.first_header(header.clone(), &h.id);
218			let count = docx.document_rels.header_count + 1;
219			docx.document_rels.header_count = count;
220			docx.content_type = docx.content_type.add_header();
221		}
222	}
223	if let Some(ref h) = docx.document.section_property.even_header_reference.clone() {
224		if let Some(header) = headers.get(&h.id) {
225			docx.document = docx.document.even_header(header.clone(), &h.id);
226			let count = docx.document_rels.header_count + 1;
227			docx.document_rels.header_count = count;
228			docx.content_type = docx.content_type.add_header();
229		}
230	}
231
232	// assign footers
233	if let Some(f) = docx.document.section_property.footer_reference.clone() {
234		if let Some(footer) = footers.get(&f.id) {
235			docx.document = docx.document.footer(footer.clone(), &f.id);
236			let count = docx.document_rels.footer_count + 1;
237			docx.document_rels.footer_count = count;
238			docx.content_type = docx.content_type.add_footer();
239		}
240	}
241
242	if let Some(ref f) = docx
243		.document
244		.section_property
245		.first_footer_reference
246		.clone()
247	{
248		if let Some(footer) = footers.get(&f.id) {
249			docx.document = docx.document.first_footer(footer.clone(), &f.id);
250			let count = docx.document_rels.footer_count + 1;
251			docx.document_rels.footer_count = count;
252			docx.content_type = docx.content_type.add_footer();
253		}
254	}
255	if let Some(ref f) = docx.document.section_property.even_footer_reference.clone() {
256		if let Some(footer) = footers.get(&f.id) {
257			docx.document = docx.document.even_footer(footer.clone(), &f.id);
258			let count = docx.document_rels.footer_count + 1;
259			docx.document_rels.footer_count = count;
260			docx.content_type = docx.content_type.add_footer();
261		}
262	}
263
264	// Read document relationships
265	// Read styles
266	let style_path = rels.find_target_path(STYLE_RELATIONSHIP_TYPE);
267	if let Some(paths) = style_path {
268		if let Some((_, style_path, ..)) = paths.get(0) {
269			let data = read_zip(
270				&mut archive,
271				style_path.to_str().expect("should have styles"),
272			)?;
273			let styles = Styles::from_xml(&data[..])?;
274			docx = docx.styles(styles);
275		}
276	}
277
278	// Read numberings
279	let num_path = rels.find_target_path(NUMBERING_RELATIONSHIP_TYPE);
280	if let Some(paths) = num_path {
281		if let Some((_, num_path, ..)) = paths.get(0) {
282			let data = read_zip(
283				&mut archive,
284				num_path.to_str().expect("should have numberings"),
285			)?;
286			let nums = Numberings::from_xml(&data[..])?;
287			docx = docx.numberings(nums);
288		}
289	}
290
291	// Read settings
292	let settings_path = rels.find_target_path(SETTINGS_TYPE);
293	if let Some(paths) = settings_path {
294		if let Some((_, settings_path, ..)) = paths.get(0) {
295			let data = read_zip(
296				&mut archive,
297				settings_path.to_str().expect("should have settings"),
298			)?;
299			let settings = Settings::from_xml(&data[..])?;
300			docx = docx.settings(settings);
301		}
302	}
303
304	// Read web settings
305	let web_settings_path = rels.find_target_path(WEB_SETTINGS_TYPE);
306	if let Some(paths) = web_settings_path {
307		if let Some((_, web_settings_path, ..)) = paths.get(0) {
308			let data = read_zip(
309				&mut archive,
310				web_settings_path
311					.to_str()
312					.expect("should have web settings"),
313			)?;
314			let web_settings = WebSettings::from_xml(&data[..])?;
315			docx = docx.web_settings(web_settings);
316		}
317	}
318	// Read media
319	let media = rels.find_target_path(IMAGE_TYPE);
320	if let Some(paths) = media {
321		for (id, media, ..) in paths {
322			if let Ok(data) = read_zip(&mut archive, media.to_str().expect("should have media")) {
323				docx = docx.add_image(id, media.to_str().unwrap().to_string(), data);
324			}
325		}
326	}
327
328	// Read hyperlinks
329	let links = rels.find_target_path(HYPERLINK_TYPE);
330	if let Some(paths) = links {
331		for (id, target, mode) in paths {
332			if let Some(mode) = mode {
333				docx =
334					docx.add_hyperlink(id, target.to_str().expect("should convert to str"), mode);
335			}
336		}
337	}
338
339	Ok(docx)
340}