// docx_rs/reader/read_xml.rs

1use super::header_or_footer_rels::ReadHeaderOrFooterRels;
2use super::namespace::*;
3use super::*;
4
5fn read_headers_from_xml(
6    rels: &Rels,
7    part_map: &HashMap<String, String>,
8    document_path: &str,
9) -> HashMap<RId, (Header, ReadHeaderOrFooterRels)> {
10    let mut headers = HashMap::new();
11
12    // Find all header relationships by looking for header types in rels
13    for (rel_type, id, target) in &rels.rels {
14        if rel_type == HEADER_TYPE {
15            let header_path = format!("{}/{}", document_path.replace("document.xml", ""), target);
16
17            if let Some(header_data) = part_map.get(&header_path) {
18                if let Ok(header) = Header::from_xml(header_data.as_bytes()) {
19                    // For simplicity, use default ReadHeaderOrFooterRels
20                    // In a full implementation, we would read the header's _rels file
21                    let header_rels = ReadHeaderOrFooterRels::default();
22                    headers.insert(id.clone(), (header, header_rels));
23                }
24            }
25        }
26    }
27
28    headers
29}
30
31fn read_footers_from_xml(
32    rels: &Rels,
33    part_map: &HashMap<String, String>,
34    document_path: &str,
35) -> HashMap<RId, (Footer, ReadHeaderOrFooterRels)> {
36    let mut footers = HashMap::new();
37
38    // Find all footer relationships by looking for footer types in rels
39    for (rel_type, id, target) in &rels.rels {
40        if rel_type == FOOTER_TYPE {
41            let footer_path = format!("{}/{}", document_path.replace("document.xml", ""), target);
42
43            if let Some(footer_data) = part_map.get(&footer_path) {
44                if let Ok(footer) = Footer::from_xml(footer_data.as_bytes()) {
45                    // For simplicity, use default ReadHeaderOrFooterRels
46                    // In a full implementation, we would read the footer's _rels file
47                    let footer_rels = ReadHeaderOrFooterRels::default();
48                    footers.insert(id.clone(), (footer, footer_rels));
49                }
50            }
51        }
52    }
53
54    footers
55}
56
57fn read_comments_from_xml(
58    rels: &Rels,
59    part_map: &HashMap<String, String>,
60    document_path: &str,
61) -> (Comments, CommentsExtended) {
62    // Simplified implementation for XML packages - try to read comments if available
63    let comments_extended = if let Some((_, _, target)) = rels.find_target(COMMENTS_EXTENDED_TYPE) {
64        let ext_path = format!("{}/{}", document_path.replace("document.xml", ""), target);
65        if let Some(comments_ext_data) = part_map.get(&ext_path) {
66            CommentsExtended::from_xml(comments_ext_data.as_bytes()).unwrap_or_default()
67        } else {
68            CommentsExtended::default()
69        }
70    } else {
71        CommentsExtended::default()
72    };
73
74    // Read comments
75    let comments = if let Some((_, _, target)) = rels.find_target(COMMENTS_TYPE) {
76        let comm_path = format!("{}/{}", document_path.replace("document.xml", ""), target);
77        if let Some(comments_data) = part_map.get(&comm_path) {
78            if let Ok(comments) = Comments::from_xml(comments_data.as_bytes()) {
79                // Process extended comments relationships
80                let mut comments_inner = comments.into_inner();
81                for i in 0..comments_inner.len() {
82                    let c = &comments_inner[i];
83                    let extended = comments_extended.children.iter().find(|ex| {
84                        for child in &c.children {
85                            if let CommentChild::Paragraph(p) = child {
86                                if ex.paragraph_id == p.id {
87                                    return true;
88                                }
89                            }
90                        }
91                        false
92                    });
93                    if let Some(CommentExtended {
94                        parent_paragraph_id: Some(parent_paragraph_id),
95                        ..
96                    }) = extended
97                    {
98                        if let Some(parent_comment) = comments_inner.iter().find(|c| {
99                            for child in &c.children {
100                                if let CommentChild::Paragraph(p) = child {
101                                    if &p.id == parent_paragraph_id {
102                                        return true;
103                                    }
104                                }
105                            }
106                            false
107                        }) {
108                            comments_inner[i].parent_comment_id = Some(parent_comment.id);
109                        }
110                    }
111                }
112                Comments {
113                    comments: comments_inner,
114                }
115            } else {
116                Comments::default()
117            }
118        } else {
119            Comments::default()
120        }
121    } else {
122        Comments::default()
123    };
124
125    (comments, comments_extended)
126}
127
128fn add_images_from_xml(
129    mut docx: Docx,
130    media: Option<Vec<(RId, PathBuf, Option<String>)>>,
131    part_map: &HashMap<String, String>,
132    document_path: &str,
133) -> Docx {
134    // Read media from XML package
135    if let Some(paths) = media {
136        for (id, media_path, ..) in paths {
137            let base_path = document_path
138                .replace("document.xml", "")
139                .trim_end_matches('/')
140                .to_string();
141            let image_path = format!("{}/{}", base_path, media_path.to_str().unwrap_or(""));
142
143            // Try multiple possible paths for the image
144            let paths_to_try = vec![
145                image_path.clone(),
146                format!("/{}", image_path.trim_start_matches('/')),
147                image_path.trim_start_matches('/').to_string(),
148            ];
149
150            for path in paths_to_try {
151                if let Some(image_data) = part_map.get(&path) {
152                    // For XML packages with binaryData, the data is base64 encoded
153                    // Remove all whitespace and newlines from base64 data
154                    let clean_base64 = image_data
155                        .chars()
156                        .filter(|c| !c.is_whitespace())
157                        .collect::<String>();
158                    let bytes = if let Ok(decoded) =
159                        base64::engine::general_purpose::STANDARD.decode(&clean_base64)
160                    {
161                        decoded
162                    } else {
163                        // If base64 decode fails, try as raw bytes (shouldn't happen for binaryData)
164                        image_data.as_bytes().to_vec()
165                    };
166                    docx =
167                        docx.add_image(id.clone(), media_path.to_str().unwrap().to_string(), bytes);
168                    break;
169                }
170            }
171        }
172    }
173    docx
174}
175
176fn add_header_footer_images(
177    docx: Docx,
178    _header_footer_rels: &ReadHeaderOrFooterRels,
179    _part_map: &HashMap<String, String>,
180    _document_path: &str,
181) -> Docx {
182    // For XML packages, header/footer image handling is simplified
183    // In a full implementation, we would process header/footer relationships for images
184    // For now, we'll use the default ReadHeaderOrFooterRels which doesn't have images
185    docx
186}
187
/// A single `pkg:part` element extracted from an XML package.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct XmlPackagePart {
    /// Value of the part's `pkg:name` attribute.
    pub name: String,
    /// Value of the part's `pkg:contentType` attribute.
    pub _content_type: String,
    /// Text found inside the part's `pkg:xmlData` or `pkg:binaryData` element.
    pub data: String,
}
195
/// Decodes a small set of character entities and collapses doubled quotes.
///
/// Every `""` is first replaced by a single `"`. Then the entities `&quot;`,
/// `&amp;`, `&lt;`, `&gt;`, `&apos;` and their numeric forms `&#34;`, `&#38;`,
/// `&#60;`, `&#62;`, `&#39;` are decoded in a single left-to-right pass.
///
/// Decoding in one pass matters: text produced by decoding an entity is never
/// decoded again, so `&amp;lt;` yields `&lt;` rather than `<`. Unknown
/// entities and stray `&` characters are copied through unchanged.
fn decode_html_entities(text: &str) -> String {
    const ENTITIES: [(&str, char); 10] = [
        ("&quot;", '"'),
        ("&amp;", '&'),
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&apos;", '\''),
        ("&#39;", '\''),
        ("&#34;", '"'),
        ("&#38;", '&'),
        ("&#60;", '<'),
        ("&#62;", '>'),
    ];

    // Collapse doubled quotes before touching entities, so quotes produced by
    // `&quot;` are not collapsed afterwards.
    let text = text.replace("\"\"", "\"");

    let mut decoded = String::with_capacity(text.len());
    let mut rest = text.as_str();
    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        rest = &rest[amp..];
        match ENTITIES.iter().find(|(name, _)| rest.starts_with(*name)) {
            Some((name, ch)) => {
                decoded.push(*ch);
                rest = &rest[name.len()..];
            }
            None => {
                // Not a known entity: keep the ampersand literally. `&` is one
                // byte, so slicing past it stays on a char boundary.
                decoded.push('&');
                rest = &rest[1..];
            }
        }
    }
    decoded.push_str(rest);
    decoded
}
213
214/// Extract parts from a Microsoft Word XML package using simple string parsing
215pub fn extract_xml_package_parts(xml_content: &str) -> Result<Vec<XmlPackagePart>, ReaderError> {
216    let mut parts = Vec::new();
217
218    // Find all pkg:part elements
219    let mut start_idx = 0;
220    while let Some(part_start) = xml_content[start_idx..].find("<pkg:part") {
221        let part_start = start_idx + part_start;
222
223        // Find the end of this part
224        if let Some(part_end) = xml_content[part_start..].find("</pkg:part>") {
225            let part_end = part_start + part_end + 11; // 11 = "</pkg:part>".len()
226            let part_xml = &xml_content[part_start..part_end];
227
228            // Extract name attribute (handle both standard and double-quoted patterns)
229            let name = if let Some(name_start) = part_xml.find("pkg:name=\"\"") {
230                // Handle double-quoted pattern: pkg:name=""value""
231                let name_start = name_start + 12; // 12 = "pkg:name=\"\"".len()
232                if let Some(name_end) = part_xml[name_start..].find("\"\"") {
233                    decode_html_entities(&part_xml[name_start..name_start + name_end])
234                } else {
235                    // Skip this part but continue processing
236                    start_idx = part_end;
237                    continue;
238                }
239            } else if let Some(name_start) = part_xml.find("pkg:name=\"") {
240                // Handle standard pattern: pkg:name="value"
241                let name_start = name_start + 10; // 10 = "pkg:name=\"".len()
242                if let Some(name_end) = part_xml[name_start..].find("\"") {
243                    part_xml[name_start..name_start + name_end].to_string()
244                } else {
245                    // Skip this part but continue processing
246                    start_idx = part_end;
247                    continue;
248                }
249            } else {
250                // Skip this part but continue processing
251                start_idx = part_end;
252                continue;
253            };
254
255            // Extract contentType attribute (handle both standard and double-quoted patterns)
256            let content_type = if let Some(type_start) = part_xml.find("pkg:contentType=\"\"") {
257                // Handle double-quoted pattern: pkg:contentType=""value""
258                let type_start = type_start + 19; // 19 = "pkg:contentType=\"\"".len()
259                if let Some(type_end) = part_xml[type_start..].find("\"\"") {
260                    decode_html_entities(&part_xml[type_start..type_start + type_end])
261                } else {
262                    // Skip this part but continue processing
263                    start_idx = part_end;
264                    continue;
265                }
266            } else if let Some(type_start) = part_xml.find("pkg:contentType=\"") {
267                // Handle standard pattern: pkg:contentType="value"
268                let type_start = type_start + 17; // 17 = "pkg:contentType=\"".len()
269                if let Some(type_end) = part_xml[type_start..].find("\"") {
270                    part_xml[type_start..type_start + type_end].to_string()
271                } else {
272                    // Skip this part but continue processing
273                    start_idx = part_end;
274                    continue;
275                }
276            } else {
277                // Skip this part but continue processing
278                start_idx = part_end;
279                continue;
280            };
281
282            // Extract xmlData or binaryData content
283            let data = if let Some(data_start) = part_xml.find("<pkg:xmlData>") {
284                let data_start = data_start + 13; // 13 = "<pkg:xmlData>".len()
285                if let Some(data_end) = part_xml[data_start..].find("</pkg:xmlData>") {
286                    part_xml[data_start..data_start + data_end].to_string()
287                } else {
288                    // Skip this part but continue processing
289                    start_idx = part_end;
290                    continue;
291                }
292            } else if let Some(data_start) = part_xml.find("<pkg:binaryData>") {
293                let data_start = data_start + 16; // 16 = "<pkg:binaryData>".len()
294                if let Some(data_end) = part_xml[data_start..].find("</pkg:binaryData>") {
295                    // Binary data in XML packages is base64 encoded
296                    part_xml[data_start..data_start + data_end]
297                        .trim()
298                        .to_string()
299                } else {
300                    // Skip this part but continue processing
301                    start_idx = part_end;
302                    continue;
303                }
304            } else {
305                // Skip this part but continue processing
306                start_idx = part_end;
307                continue;
308            };
309
310            parts.push(XmlPackagePart {
311                name,
312                _content_type: content_type,
313                data,
314            });
315
316            start_idx = part_end;
317        } else {
318            // No closing tag found, move to next position to avoid infinite loop
319            start_idx = part_start + 1;
320        }
321    }
322
323    Ok(parts)
324}
325
326/// Read a Docx from Microsoft Word XML format (single XML file with all parts)
327pub fn read_xml(xml_content: &str) -> Result<Docx, ReaderError> {
328    let mut docx = Docx::new();
329
330    // Extract all parts from the XML package
331    let parts = extract_xml_package_parts(xml_content)?;
332
333    // Create a HashMap for easy lookup of parts by name
334    let mut part_map: HashMap<String, String> = HashMap::new();
335    for part in parts {
336        part_map.insert(part.name, part.data);
337    }
338
339    // Read content types (not strictly necessary for XML format, but helps with compatibility)
340    let _content_types = if let Some(content_types_data) = part_map.get("[Content_Types].xml") {
341        ContentTypes::from_xml(content_types_data.as_bytes()).ok()
342    } else {
343        None
344    };
345
346    // Read main relationships (try both with and without leading slash)
347    let rels = if let Some(rels_data) = part_map.get("_rels/.rels") {
348        Rels::from_xml(rels_data.as_bytes())?
349    } else if let Some(rels_data) = part_map.get("/_rels/.rels") {
350        Rels::from_xml(rels_data.as_bytes())?
351    } else {
352        return Err(ReaderError::DocumentNotFoundError);
353    };
354
355    // Find the main document path
356    let main_rel = rels
357        .find_target(DOC_RELATIONSHIP_TYPE)
358        .ok_or(ReaderError::DocumentNotFoundError);
359
360    let document_path = if let Ok(rel) = main_rel {
361        rel.2.clone()
362    } else {
363        "word/document.xml".to_owned()
364    };
365
366    // Read custom properties if available
367    if let Some(custom_props) = rels.find_target(CUSTOM_PROPERTIES_TYPE) {
368        if let Some(data) = part_map.get(&custom_props.2) {
369            if let Ok(custom) = CustomProps::from_xml(data.as_bytes()) {
370                docx.doc_props.custom = custom;
371            }
372        }
373    }
374
375    // Read document relationships - use Rels directly for XML packages
376    let document_rels_path = document_path.replace("document.xml", "_rels/document.xml.rels");
377    let document_rels = if let Some(rels_data) = part_map.get(&document_rels_path) {
378        Rels::from_xml(rels_data.as_bytes())?
379    } else if let Some(rels_data) = part_map.get(&format!("/{}", document_rels_path)) {
380        Rels::from_xml(rels_data.as_bytes())?
381    } else {
382        Rels::default()
383    };
384
385    // Read themes (improved implementation with debug)
386    if let Some(theme_rel) = document_rels.find_target(THEME_TYPE) {
387        let base_path = document_path
388            .replace("document.xml", "")
389            .trim_end_matches('/')
390            .to_string();
391        let theme_path_str = format!("{}/{}", base_path, theme_rel.2);
392
393        // Also try the direct path from XML package
394        let direct_theme_path = if theme_path_str.starts_with("/") {
395            theme_path_str.clone()
396        } else {
397            format!("/{}", theme_path_str)
398        };
399
400        if let Some(theme_data) = part_map
401            .get(&theme_path_str)
402            .or_else(|| part_map.get(&direct_theme_path))
403        {
404            match Theme::from_xml(theme_data.as_bytes()) {
405                Ok(theme) => {
406                    docx.themes.push(theme);
407                }
408                Err(_) => {
409                    // Theme parsing failed - this is a known limitation for some XML formats
410                }
411            }
412        }
413    }
414
415    // Read headers and footers
416    let headers = read_headers_from_xml(&document_rels, &part_map, &document_path);
417    let footers = read_footers_from_xml(&document_rels, &part_map, &document_path);
418
419    // Read comments and comments extended
420    let (comments, comments_extended) =
421        read_comments_from_xml(&document_rels, &part_map, &document_path);
422
423    // Read the main document (try both with and without leading slash)
424    let document = if let Some(doc_data) = part_map.get(&document_path) {
425        Document::from_xml(doc_data.as_bytes())?
426    } else if let Some(doc_data) = part_map.get(&format!("/{}", document_path)) {
427        Document::from_xml(doc_data.as_bytes())?
428    } else if let Some(stripped_path) = document_path.strip_prefix("/") {
429        if let Some(doc_data) = part_map.get(stripped_path) {
430            Document::from_xml(doc_data.as_bytes())?
431        } else {
432            return Err(ReaderError::DocumentNotFoundError);
433        }
434    } else {
435        return Err(ReaderError::DocumentNotFoundError);
436    };
437
438    docx = docx.document(document);
439
440    // Read styles if available (improved implementation with debug)
441    if let Some(styles_rel) = document_rels.find_target(STYLE_RELATIONSHIP_TYPE) {
442        let base_path = document_path
443            .replace("document.xml", "")
444            .trim_end_matches('/')
445            .to_string();
446        let styles_path_str = format!("{}/{}", base_path, styles_rel.2);
447
448        // Also try the direct path from XML package
449        let direct_styles_path = if styles_path_str.starts_with("/") {
450            styles_path_str.clone()
451        } else {
452            format!("/{}", styles_path_str)
453        };
454
455        if let Some(styles_data) = part_map
456            .get(&styles_path_str)
457            .or_else(|| part_map.get(&direct_styles_path))
458        {
459            if let Ok(styles) = Styles::from_xml(styles_data.as_bytes()) {
460                docx.styles = styles;
461            }
462        }
463    }
464
465    // Read numbering if available (improved implementation with debug)
466    if let Some(numbering_rel) = document_rels.find_target(NUMBERING_RELATIONSHIP_TYPE) {
467        let base_path = document_path
468            .replace("document.xml", "")
469            .trim_end_matches('/')
470            .to_string();
471        let numbering_path_str = format!("{}/{}", base_path, numbering_rel.2);
472
473        // Also try the direct path from XML package
474        let direct_numbering_path = if numbering_path_str.starts_with("/") {
475            numbering_path_str.clone()
476        } else {
477            format!("/{}", numbering_path_str)
478        };
479
480        if let Some(numbering_data) = part_map
481            .get(&numbering_path_str)
482            .or_else(|| part_map.get(&direct_numbering_path))
483        {
484            if let Ok(numberings) = Numberings::from_xml(numbering_data.as_bytes()) {
485                docx.numberings = numberings;
486            }
487        }
488    } else {
489        // Try to find numbering.xml directly in the package if no relationship is found
490        let direct_paths = ["/word/numbering.xml", "word/numbering.xml"];
491        for path in &direct_paths {
492            if let Some(numbering_data) = part_map.get(*path) {
493                if let Ok(numberings) = Numberings::from_xml(numbering_data.as_bytes()) {
494                    docx.numberings = numberings;
495                    break;
496                }
497            }
498        }
499    }
500
501    // Read settings if available (basic implementation)
502    if let Some(settings_rel) = document_rels.find_target(SETTINGS_TYPE) {
503        let base_path = document_path
504            .replace("document.xml", "")
505            .trim_end_matches('/')
506            .to_string();
507        let settings_path_str = format!("{}/{}", base_path, settings_rel.2);
508        if let Some(settings_data) = part_map.get(&settings_path_str) {
509            if let Ok(settings) = Settings::from_xml(settings_data.as_bytes()) {
510                docx.settings = settings;
511            }
512        }
513    }
514
515    // Read web settings if available (basic implementation)
516    if let Some(web_settings_rel) = document_rels.find_target(WEB_SETTINGS_TYPE) {
517        let base_path = document_path
518            .replace("document.xml", "")
519            .trim_end_matches('/')
520            .to_string();
521        let web_settings_path_str = format!("{}/{}", base_path, web_settings_rel.2);
522        if let Some(web_settings_data) = part_map.get(&web_settings_path_str) {
523            if let Ok(web_settings) = WebSettings::from_xml(web_settings_data.as_bytes()) {
524                docx.web_settings = web_settings;
525            }
526        }
527    }
528
529    // Assign headers
530    if let Some(h) = docx.document.section_property.header_reference.clone() {
531        if let Some((header, header_rels)) = headers.get(&h.id) {
532            docx.document = docx.document.header(header.clone(), &h.id);
533            let count = docx.document_rels.header_count + 1;
534            docx.document_rels.header_count = count;
535            docx.content_type = docx.content_type.add_header();
536
537            // Read media from header if available
538            docx = add_header_footer_images(docx, header_rels, &part_map, &document_path);
539        }
540    }
541    if let Some(ref h) = docx
542        .document
543        .section_property
544        .first_header_reference
545        .clone()
546    {
547        if let Some((header, header_rels)) = headers.get(&h.id) {
548            docx.document = docx
549                .document
550                .first_header_without_title_pg(header.clone(), &h.id);
551            let count = docx.document_rels.header_count + 1;
552            docx.document_rels.header_count = count;
553            docx.content_type = docx.content_type.add_header();
554
555            // Read media from header if available
556            docx = add_header_footer_images(docx, header_rels, &part_map, &document_path);
557        }
558    }
559    if let Some(ref h) = docx.document.section_property.even_header_reference.clone() {
560        if let Some((header, header_rels)) = headers.get(&h.id) {
561            docx.document = docx.document.even_header(header.clone(), &h.id);
562            let count = docx.document_rels.header_count + 1;
563            docx.document_rels.header_count = count;
564            docx.content_type = docx.content_type.add_header();
565
566            // Read media from header if available
567            docx = add_header_footer_images(docx, header_rels, &part_map, &document_path);
568        }
569    }
570
571    // Assign footers
572    if let Some(f) = docx.document.section_property.footer_reference.clone() {
573        if let Some((footer, footer_rels)) = footers.get(&f.id) {
574            docx.document = docx.document.footer(footer.clone(), &f.id);
575            let count = docx.document_rels.footer_count + 1;
576            docx.document_rels.footer_count = count;
577            docx.content_type = docx.content_type.add_footer();
578
579            // Read media from footer if available
580            docx = add_header_footer_images(docx, footer_rels, &part_map, &document_path);
581        }
582    }
583    if let Some(ref f) = docx
584        .document
585        .section_property
586        .first_footer_reference
587        .clone()
588    {
589        if let Some((footer, footer_rels)) = footers.get(&f.id) {
590            docx.document = docx
591                .document
592                .first_footer_without_title_pg(footer.clone(), &f.id);
593            let count = docx.document_rels.footer_count + 1;
594            docx.document_rels.footer_count = count;
595            docx.content_type = docx.content_type.add_footer();
596
597            // Read media from footer if available
598            docx = add_header_footer_images(docx, footer_rels, &part_map, &document_path);
599        }
600    }
601    if let Some(ref f) = docx.document.section_property.even_footer_reference.clone() {
602        if let Some((footer, footer_rels)) = footers.get(&f.id) {
603            docx.document = docx.document.even_footer(footer.clone(), &f.id);
604            let count = docx.document_rels.footer_count + 1;
605            docx.document_rels.footer_count = count;
606            docx.content_type = docx.content_type.add_footer();
607
608            // Read media from footer if available
609            docx = add_header_footer_images(docx, footer_rels, &part_map, &document_path);
610        }
611    }
612
613    // Store comments to paragraphs
614    if !comments.inner().is_empty() {
615        docx.store_comments(comments.inner());
616        docx = docx.comments(comments);
617        docx = docx.comments_extended(comments_extended);
618    }
619
620    // Read and add images from XML package
621    let media = document_rels
622        .rels
623        .iter()
624        .filter(|(rel_type, ..)| *rel_type == IMAGE_TYPE)
625        .map(|(_, id, target)| (id.clone(), PathBuf::from(target), None))
626        .collect::<Vec<_>>();
627
628    if !media.is_empty() {
629        docx = add_images_from_xml(docx, Some(media), &part_map, &document_path);
630    }
631
632    // Read and add hyperlinks - simplified for XML packages
633    if let Some((id, _, target)) = document_rels.find_target(HYPERLINK_TYPE) {
634        // For XML packages, we'll add the hyperlink with a default mode
635        docx = docx.add_hyperlink(id, target, "External".to_string());
636    }
637
638    Ok(docx)
639}
640
#[cfg(test)]
mod tests {
    use super::*;

    // Disabled test: it reads a fixture (`1.xml`) from outside the crate.
    // NOTE(review): presumably disabled because that fixture is not checked in - confirm.
    //
    // #[test]
    // fn test_read_xml_with_numbering() {
    //     let xml_content = include_str!("../../../1.xml");
    //     let docx = read_xml(xml_content).unwrap();
    //
    //     println!("Abstract nums count: {}", docx.numberings.abstract_nums.len());
    //     println!("Numberings count: {}", docx.numberings.numberings.len());
    //
    //     // Confirm that at least the minimum numbering data has been read
    //     assert!(docx.numberings.abstract_nums.len() > 0, "Abstract numberings should be read");
    //     assert!(docx.numberings.numberings.len() > 0, "Numberings should be read");
    // }

    /// A package holding only the root relationships and a one-paragraph
    /// document parses successfully and yields a non-empty document.
    #[test]
    fn test_read_xml_basic() {
        // A minimal XML package with just the basic structure
        let xml_content = r#"<?xml version="1.0" standalone="yes"?>
<pkg:package xmlns:pkg="http://schemas.microsoft.com/office/2006/xmlPackage">
    <pkg:part pkg:name="/_rels/.rels" pkg:contentType="application/vnd.openxmlformats-package.relationships+xml">
        <pkg:xmlData>
            <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
                <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
            </Relationships>
        </pkg:xmlData>
    </pkg:part>
    <pkg:part pkg:name="/word/document.xml" pkg:contentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml">
        <pkg:xmlData>
            <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
                <w:body>
                    <w:p>
                        <w:r>
                            <w:t>Hello, World!</w:t>
                        </w:r>
                    </w:p>
                </w:body>
            </w:document>
        </pkg:xmlData>
    </pkg:part>
</pkg:package>"#;

        let result = read_xml(xml_content);
        assert!(result.is_ok(), "Failed to parse XML: {:?}", result.err());

        let docx = result.unwrap();
        assert!(
            !docx.document.children.is_empty(),
            "Document should contain some content"
        );
    }

    /// A single standard-quoted part is extracted with its name and the text
    /// of its `pkg:xmlData` element.
    #[test]
    fn test_extract_xml_package_parts() {
        let xml_content = r#"<pkg:package xmlns:pkg="http://schemas.microsoft.com/office/2006/xmlPackage">
    <pkg:part pkg:name="/_rels/.rels" pkg:contentType="application/vnd.openxmlformats-package.relationships+xml">
        <pkg:xmlData>
            <test>content</test>
        </pkg:xmlData>
    </pkg:part>
</pkg:package>"#;

        let result = extract_xml_package_parts(xml_content);
        assert!(result.is_ok());

        let parts = result.unwrap();
        assert_eq!(parts.len(), 1);
        assert_eq!(parts[0].name, "/_rels/.rels");
        assert!(parts[0].data.contains("<test>content</test>"));
    }

    /// A package with an image relationship and a base64 `pkg:binaryData`
    /// part yields exactly one image whose bytes decode to PNG data.
    #[test]
    fn test_read_xml_with_image() {
        let xml_content = r#"<?xml version="1.0" standalone="yes"?>
<pkg:package xmlns:pkg="http://schemas.microsoft.com/office/2006/xmlPackage">
  <pkg:part pkg:name="/_rels/.rels" pkg:contentType="application/vnd.openxmlformats-package.relationships+xml">
    <pkg:xmlData>
      <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
        <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
      </Relationships>
    </pkg:xmlData>
  </pkg:part>
  <pkg:part pkg:name="/word/document.xml" pkg:contentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml">
    <pkg:xmlData>
      <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">
        <w:body>
          <w:p>
            <w:r>
              <w:drawing>
                <wp:inline distT="0" distB="0" distL="0" distR="0">
                  <wp:extent cx="1000000" cy="1000000"/>
                  <wp:docPr id="1" name="Picture 1"/>
                  <wp:cNvGraphicFramePr>
                    <a:graphicFrameLocks noChangeAspect="1"/>
                  </wp:cNvGraphicFramePr>
                  <a:graphic>
                    <a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/picture">
                      <pic:pic>
                        <pic:nvPicPr>
                          <pic:cNvPr id="1" name="test.png"/>
                          <pic:cNvPicPr/>
                        </pic:nvPicPr>
                        <pic:blipFill>
                          <a:blip r:embed="rId1"/>
                          <a:stretch>
                            <a:fillRect/>
                          </a:stretch>
                        </pic:blipFill>
                        <pic:spPr>
                          <a:xfrm>
                            <a:off x="0" y="0"/>
                            <a:ext cx="1000000" cy="1000000"/>
                          </a:xfrm>
                          <a:prstGeom prst="rect">
                            <a:avLst/>
                          </a:prstGeom>
                        </pic:spPr>
                      </pic:pic>
                    </a:graphicData>
                  </a:graphic>
                </wp:inline>
              </w:drawing>
            </w:r>
          </w:p>
        </w:body>
      </w:document>
    </pkg:xmlData>
  </pkg:part>
  <pkg:part pkg:name="/word/_rels/document.xml.rels" pkg:contentType="application/vnd.openxmlformats-package.relationships+xml">
    <pkg:xmlData>
      <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
        <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/image1.png"/>
      </Relationships>
    </pkg:xmlData>
  </pkg:part>
  <pkg:part pkg:name="/word/media/image1.png" pkg:contentType="image/png">
    <pkg:binaryData>iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==</pkg:binaryData>
  </pkg:part>
</pkg:package>"#;

        let result = read_xml(xml_content);
        assert!(
            result.is_ok(),
            "Failed to parse XML with image: {:?}",
            result.err()
        );

        let docx = result.unwrap();
        assert!(
            !docx.document.children.is_empty(),
            "Document should contain some content"
        );

        // Check that the image was loaded
        assert!(!docx.images.is_empty(), "Document should contain images");
        assert_eq!(
            docx.images.len(),
            1,
            "Document should contain exactly one image"
        );

        // Verify the image data was decoded correctly (this is a 1x1 pixel PNG)
        let image = &docx.images[0];
        assert_eq!(image.1, "media/image1.png", "Image path should match");
        assert!(!image.2 .0.is_empty(), "Image data should not be empty");

        // Verify it's proper PNG data (starts with PNG magic bytes)
        assert_eq!(
            &image.2 .0[0..8],
            &[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A],
            "Should be valid PNG data"
        );
    }
}
// docx_rs/reader/read_xml.rs