orbok_extract/
docx.rs

1//! DOCX text extractor (Microsoft Word 2007+).
2//!
3//! DOCX files are ZIP archives containing XML. This extractor reads
4//! `word/document.xml` and strips XML tags to recover paragraph text.
5//! Location quality is `Approximate` — DOCX does not provide byte
6//! offsets; only paragraph order is preserved.
7//!
8//! Security: the file is opened with `zip::ZipArchive` which bounds
9//! reads to the archive contents. No external entity expansion.
10
11use crate::normalize::normalize_document;
12use crate::types::{
13    DocumentExtractor, ExtractOutput, ExtractedSegment, LocationQuality, SegmentKind,
14};
15use orbok_core::{ErrorCategory, OrbokError, OrbokResult, versions::NORMALIZATION_VERSION};
16use orbok_fs::ValidatedPath;
17use std::io::Read;
18
/// Name reported by [`DocxExtractor`] through `name()` and recorded in the
/// `extractor_name` field of every `ExtractOutput` it produces.
const EXTRACTOR_NAME: &str = "docx";
/// Version string reported by [`DocxExtractor`] through `version()` and
/// recorded in the `extractor_version` field of its output.
const EXTRACTOR_VERSION: &str = "v1";

/// Text extractor for `.docx` files.
///
/// The type carries no state, so it is a unit struct; the common value
/// traits are derived so it can be logged, copied and default-constructed
/// like any other plain value.
#[derive(Debug, Clone, Copy, Default)]
pub struct DocxExtractor;
23
24impl DocumentExtractor for DocxExtractor {
25    fn name(&self) -> &'static str {
26        EXTRACTOR_NAME
27    }
28    fn version(&self) -> &'static str {
29        EXTRACTOR_VERSION
30    }
31    fn supported_extensions(&self) -> &'static [&'static str] {
32        &["docx"]
33    }
34
35    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
36        let file = std::fs::File::open(&path.canonical)?;
37        let mut zip = zip::ZipArchive::new(file).map_err(|e| OrbokError::Extraction {
38            category: ErrorCategory::ParserError,
39            message: format!("docx zip: {e}"),
40        })?;
41
42        let xml = match zip.by_name("word/document.xml") {
43            Ok(mut entry) => {
44                let mut s = String::new();
45                entry
46                    .read_to_string(&mut s)
47                    .map_err(|e| OrbokError::Extraction {
48                        category: ErrorCategory::ParserError,
49                        message: format!("docx xml read: {e}"),
50                    })?;
51                s
52            }
53            Err(_) => {
54                return Err(OrbokError::Extraction {
55                    category: ErrorCategory::UnsupportedFormat,
56                    message: "no word/document.xml in archive".into(),
57                });
58            }
59        };
60
61        // Extract text runs from w:p paragraphs.
62        let paragraphs = extract_paragraphs(&xml);
63        let mut segments = Vec::new();
64        let mut total_chars = 0u64;
65
66        for (para_idx, para_text) in paragraphs.iter().enumerate() {
67            let norm = normalize_document(para_text);
68            if norm.trim().is_empty() {
69                continue;
70            }
71            total_chars += norm.len() as u64;
72            segments.push(ExtractedSegment {
73                kind: SegmentKind::Paragraph,
74                text: norm,
75                line_start: (para_idx + 1) as u32,
76                line_end: (para_idx + 1) as u32,
77                heading_path: None,
78                location_quality: LocationQuality::Approximate,
79            });
80        }
81
82        Ok(ExtractOutput {
83            extractor_name: EXTRACTOR_NAME.to_string(),
84            extractor_version: EXTRACTOR_VERSION.to_string(),
85            normalization_version: NORMALIZATION_VERSION.to_string(),
86            segments,
87            char_count: total_chars,
88        })
89    }
90}
91
/// Extract paragraph text from DOCX `word/document.xml` by collecting
/// all `w:t` text runs within each `w:p` paragraph element.
///
/// The input is scanned tag by tag without a full XML parser. Elements are
/// recognised by their exact name, so `w:tab`, `w:tabs`, `w:tbl`, `w:tc`,
/// `w:tr` and friends are never mistaken for a `w:t` text run. Run text has
/// XML entities and character references decoded.
///
/// Returns one trimmed string per paragraph, in document order; paragraphs
/// whose text is empty or whitespace-only are omitted.
fn extract_paragraphs(xml: &str) -> Vec<String> {
    let mut paragraphs = Vec::new();
    let mut current_para = String::new();
    let mut in_para = false;
    let mut pos = 0;

    while let Some(offset) = xml[pos..].find('<') {
        let tag_start = pos + offset;
        // An unterminated tag swallows the rest of the input.
        let tag_end = xml[tag_start..]
            .find('>')
            .map_or(xml.len(), |p| tag_start + p + 1);
        let tag = &xml[tag_start..tag_end];
        pos = tag_end;

        let is_closing = tag.starts_with("</");
        let is_self_closing = tag.ends_with("/>");

        match element_name(tag) {
            "w:p" if is_closing => {
                let text = current_para.trim();
                if in_para && !text.is_empty() {
                    paragraphs.push(text.to_string());
                }
                in_para = false;
                current_para.clear();
            }
            // A self-closing `<w:p/>` is an empty paragraph: nothing to
            // collect and no state to change.
            "w:p" if !is_self_closing => {
                in_para = true;
                current_para.clear();
            }
            // A self-closing `<w:t/>` has no content; scanning ahead for a
            // `</w:t>` would steal text from a later run or paragraph.
            "w:t" if in_para && !is_closing && !is_self_closing => {
                if let Some(len) = xml[tag_end..].find("</w:t>") {
                    let text_end = tag_end + len;
                    current_para.push_str(&clean_run_text(&xml[tag_end..text_end]));
                    // Resume at `</w:t>` so the main loop consumes it.
                    pos = text_end;
                }
            }
            _ => {}
        }
    }
    paragraphs
}

/// Returns the element name of a raw tag such as `<w:p attr="x">`,
/// `</w:p>` or `<w:tab/>`, without the angle brackets, the closing slash
/// or any attributes.
fn element_name(tag: &str) -> &str {
    let inner = tag
        .strip_prefix("</")
        .or_else(|| tag.strip_prefix('<'))
        .unwrap_or(tag);
    let end = inner
        .find(|c: char| c.is_whitespace() || c == '/' || c == '>')
        .unwrap_or(inner.len());
    &inner[..end]
}

/// Cleans the raw content of a `w:t` element: drops any nested markup,
/// then decodes entities. Stripping happens first so that a decoded `&lt;`
/// is never mistaken for the start of a tag, and a literal `>` outside a
/// tag is kept as text.
fn clean_run_text(raw: &str) -> String {
    let mut stripped = String::with_capacity(raw.len());
    let mut in_tag = false;
    for c in raw.chars() {
        match c {
            '<' => in_tag = true,
            '>' if in_tag => in_tag = false,
            _ if !in_tag => stripped.push(c),
            _ => {}
        }
    }
    decode_xml_entities(&stripped)
}

/// Decodes the five predefined XML entities plus decimal (`&#65;`) and
/// hexadecimal (`&#x41;`) character references. Anything that does not
/// parse as one of those is copied through unchanged.
fn decode_xml_entities(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];
        let decoded = after.find(';').and_then(|semi| {
            let entity = &after[..semi];
            let ch = match entity {
                "amp" => Some('&'),
                "lt" => Some('<'),
                "gt" => Some('>'),
                "quot" => Some('"'),
                "apos" => Some('\''),
                _ => entity.strip_prefix('#').and_then(|num| {
                    let code = match num.strip_prefix('x') {
                        Some(hex) => u32::from_str_radix(hex, 16),
                        None => num.parse::<u32>(),
                    };
                    code.ok().and_then(char::from_u32)
                }),
            };
            ch.map(|c| (c, semi))
        });
        match decoded {
            Some((c, semi)) => {
                out.push(c);
                rest = &after[semi + 1..];
            }
            None => {
                // Not a recognised reference: keep the ampersand literally.
                out.push('&');
                rest = after;
            }
        }
    }
    out.push_str(rest);
    out
}
orbok_extract/docx.rs

orbok_extract/
docx.rs