Skip to main content

orbok_extract/
docx.rs

1//! DOCX text extractor (Microsoft Word 2007+; RFC-044 §16.4 resource limits).
2//!
3//! DOCX files are ZIP archives containing XML. This extractor reads
4//! `word/document.xml` and strips XML tags to recover paragraph text.
5//! Location quality is `Approximate` — DOCX does not provide byte
6//! offsets; only paragraph order is preserved.
7//!
8//! Security: the file is opened with `zip::ZipArchive` which bounds
9//! reads to the archive contents. No external entity expansion.
10
11use crate::normalize::normalize_document;
12use crate::types::{
13    DocumentExtractor, ExtractContext, ExtractOutput, ExtractWarning, ExtractedSegment,
14    LocationKind, LocationQuality, SegmentKind, read_error_category,
15};
16use orbok_core::{ErrorCategory, OrbokError, OrbokResult, versions::NORMALIZATION_VERSION};
17use orbok_fs::ValidatedPath;
18use std::io::Read;
19
/// Identifier this extractor reports from `name()` and records in the
/// `extractor_name` field of its output.
const EXTRACTOR_NAME: &str = "docx";
/// Version tag this extractor reports from `version()` and records in the
/// `extractor_version` field of its output.
const EXTRACTOR_VERSION: &str = "v1";

/// Text extractor for `.docx` files.
///
/// The type carries no state; all configuration arrives through the
/// `ExtractContext` passed to `extract_with_context`.
#[derive(Debug, Clone, Copy, Default)]
pub struct DocxExtractor;
24
25impl DocumentExtractor for DocxExtractor {
26    fn name(&self) -> &'static str {
27        EXTRACTOR_NAME
28    }
29
30    fn version(&self) -> &'static str {
31        EXTRACTOR_VERSION
32    }
33
34    fn supported_extensions(&self) -> &'static [&'static str] {
35        &["docx"]
36    }
37
38    fn extract_with_context(
39        &self,
40        path: &ValidatedPath,
41        context: &ExtractContext,
42    ) -> OrbokResult<ExtractOutput> {
43        let limits = &context.limits;
44        let mut warnings = Vec::new();
45
46        // RFC-044 §9.5: check file size before opening ZIP.
47        let meta = std::fs::metadata(&path.canonical).map_err(|e| OrbokError::Extraction {
48            category: read_error_category(&e),
49            message: e.to_string(),
50        })?;
51        if meta.len() > limits.max_zip_entry_bytes {
52            return Err(OrbokError::Extraction {
53                category: ErrorCategory::FileTooLarge,
54                message: format!(
55                    "DOCX file is {} bytes, limit is {}",
56                    meta.len(),
57                    limits.max_zip_entry_bytes
58                ),
59            });
60        }
61
62        let file = std::fs::File::open(&path.canonical).map_err(|e| OrbokError::Extraction {
63            category: read_error_category(&e),
64            message: e.to_string(),
65        })?;
66        let mut zip = zip::ZipArchive::new(file).map_err(|e| OrbokError::Extraction {
67            category: ErrorCategory::ParserError,
68            message: format!("docx zip: {e}"),
69        })?;
70
71        // RFC-044 §9.5: enforce per-entry XML size limit.
72        let xml = match zip.by_name("word/document.xml") {
73            Ok(mut entry) => {
74                if entry.size() > limits.max_docx_xml_bytes {
75                    warnings.push(ExtractWarning::SizeLimitReached {
76                        limit_name: "max_docx_xml_bytes".into(),
77                    });
78                    // Read only up to limit.
79                    let mut buf = vec![0u8; limits.max_docx_xml_bytes as usize];
80                    let n = entry.read(&mut buf).map_err(|e| OrbokError::Extraction {
81                        category: ErrorCategory::ParserError,
82                        message: format!("docx xml read: {e}"),
83                    })?;
84                    buf.truncate(n);
85                    // Best-effort UTF-8; invalid bytes → replacement chars.
86                    String::from_utf8_lossy(&buf).into_owned()
87                } else {
88                    let mut s = String::new();
89                    entry
90                        .read_to_string(&mut s)
91                        .map_err(|e| OrbokError::Extraction {
92                            category: ErrorCategory::ParserError,
93                            message: format!("docx xml read: {e}"),
94                        })?;
95                    s
96                }
97            }
98            Err(_) => {
99                return Err(OrbokError::Extraction {
100                    category: ErrorCategory::UnsupportedFormat,
101                    message: "no word/document.xml in archive".into(),
102                });
103            }
104        };
105
106        // Extract text runs from w:p paragraphs.
107        let paragraphs = extract_paragraphs(&xml);
108        let mut segments = Vec::new();
109        let mut total_chars = 0u64;
110
111        for (para_idx, para_text) in paragraphs.iter().enumerate() {
112            // RFC-044 §9.5: segment and char limits.
113            if segments.len() >= limits.max_segments {
114                warnings.push(ExtractWarning::SizeLimitReached {
115                    limit_name: "max_segments".into(),
116                });
117                break;
118            }
119
120            let norm = normalize_document(para_text);
121            if norm.trim().is_empty() {
122                continue;
123            }
124            let para_chars = norm.chars().count() as u64;
125            if total_chars + para_chars > limits.max_extracted_chars {
126                warnings.push(ExtractWarning::SizeLimitReached {
127                    limit_name: "max_extracted_chars".into(),
128                });
129                break;
130            }
131            total_chars += para_chars;
132
133            segments.push(ExtractedSegment {
134                kind: SegmentKind::Paragraph,
135                text: norm,
136                line_start: (para_idx + 1) as u32,
137                line_end: (para_idx + 1) as u32,
138                location_kind: LocationKind::Paragraphs,
139                heading_path: None,
140                location_quality: LocationQuality::Approximate,
141            });
142        }
143
144        Ok(ExtractOutput {
145            extractor_name: EXTRACTOR_NAME.to_string(),
146            extractor_version: EXTRACTOR_VERSION.to_string(),
147            normalization_version: NORMALIZATION_VERSION.to_string(),
148            segments,
149            char_count: total_chars,
150            warnings,
151        })
152    }
153
154    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
155        self.extract_with_context(path, &ExtractContext::default())
156    }
157}
158
/// Extract paragraph text from DOCX `word/document.xml` by collecting
/// all `w:t` text runs within each `w:p` paragraph element.
///
/// This is a lightweight tag scanner, not an XML parser. It returns one
/// trimmed string per paragraph that contains non-whitespace text, in
/// document order. Character and entity references (`&amp;`, `&#233;`,
/// `&#x41;`, ...) inside text runs are decoded. Elements whose names merely
/// start with `w:t` or `w:p` (`w:tab`, `w:tbl`, `w:pPr`, ...) and
/// self-closing `<w:t/>` / `<w:p .../>` elements are ignored. A paragraph
/// that is never closed (for example because the input was truncated) is
/// dropped.
fn extract_paragraphs(xml: &str) -> Vec<String> {
    let bytes = xml.as_bytes();
    let mut paragraphs = Vec::new();
    let mut current_para = String::new();
    let mut in_para = false;
    // Set once a search for `</w:t>` reaches end of input without a match;
    // later searches cannot succeed either, so skipping them keeps the scan
    // linear on malformed input.
    let mut closers_exhausted = false;
    let mut pos = 0;

    while pos < bytes.len() {
        if bytes[pos] != b'<' {
            pos += 1;
            continue;
        }

        // `pos` sits on an ASCII '<' and `end` just past an ASCII '>' (or at
        // the end of input), so both are valid `str` slice boundaries.
        let end = bytes[pos..]
            .iter()
            .position(|&b| b == b'>')
            .map_or(bytes.len(), |p| pos + p + 1);
        let tag = &xml[pos..end];

        if opens_element(tag, "w:p") {
            in_para = true;
            current_para.clear();
        } else if tag.starts_with("</w:p>") {
            let trimmed = current_para.trim();
            if in_para && !trimmed.is_empty() {
                paragraphs.push(trimmed.to_string());
            }
            in_para = false;
            current_para.clear();
        } else if in_para && opens_element(tag, "w:t") {
            let text_start = end;
            let text_end = if closers_exhausted {
                text_start
            } else {
                match xml[text_start..].find("</w:t>") {
                    Some(offset) => text_start + offset,
                    None => {
                        closers_exhausted = true;
                        text_start
                    }
                }
            };
            append_run_text(&xml[text_start..text_end], &mut current_para);
            // Resume at the closing tag (or right after the opener when no
            // closer exists); either way `pos` strictly advances.
            pos = text_end;
            continue;
        }
        pos = end;
    }
    paragraphs
}

/// Returns `true` when `tag` is a non-self-closing start tag for exactly
/// the element `name` (e.g. `<w:t>` or `<w:t xml:space="preserve">`), and
/// `false` for longer names sharing the prefix (e.g. `<w:tab/>`).
fn opens_element(tag: &str, name: &str) -> bool {
    match tag.strip_prefix('<').and_then(|t| t.strip_prefix(name)) {
        Some(rest) => {
            !tag.ends_with("/>")
                && rest.starts_with(|c: char| c == '>' || c.is_ascii_whitespace())
        }
        None => false,
    }
}

/// Appends the visible text of one `w:t` run to `out`: any markup inside
/// `raw` is dropped first, then XML entity references are decoded.
fn append_run_text(raw: &str, out: &mut String) {
    // Longest reference body we try to decode, e.g. `#x0010FFFF`; bounding
    // the look-ahead keeps a stray '&' from scanning the rest of the run.
    const MAX_ENTITY_LEN: usize = 12;

    // Strip tags before decoding so that a decoded `&lt;` is kept as text.
    let mut visible = String::with_capacity(raw.len());
    let mut in_tag = false;
    for c in raw.chars() {
        match c {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if !in_tag => visible.push(c),
            _ => {}
        }
    }

    let mut rest = visible.as_str();
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];
        let decoded = after
            .bytes()
            .take(MAX_ENTITY_LEN)
            .position(|b| b == b';')
            .and_then(|semi| decode_entity(&after[..semi]).map(|ch| (ch, semi)));
        match decoded {
            Some((ch, semi)) => {
                out.push(ch);
                rest = &after[semi + 1..];
            }
            // Not a recognisable reference: keep the '&' literally.
            None => {
                out.push('&');
                rest = after;
            }
        }
    }
    out.push_str(rest);
}

/// Decodes the body of an XML reference (the part between `&` and `;`):
/// the five predefined entities plus decimal (`#233`) and hexadecimal
/// (`#xE9`) character references. Returns `None` for anything else.
fn decode_entity(name: &str) -> Option<char> {
    match name {
        "amp" => Some('&'),
        "lt" => Some('<'),
        "gt" => Some('>'),
        "quot" => Some('"'),
        "apos" => Some('\''),
        _ => {
            let digits = name.strip_prefix('#')?;
            let code = match digits.strip_prefix('x') {
                Some(hex) => u32::from_str_radix(hex, 16).ok()?,
                None => digits.parse::<u32>().ok()?,
            };
            char::from_u32(code)
        }
    }
}