orbok_extract/
docx.rs

1//! DOCX text extractor (Microsoft Word 2007+).
2//!
3//! DOCX files are ZIP archives containing XML. This extractor reads
4//! `word/document.xml` and strips XML tags to recover paragraph text.
5//! Location quality is `Approximate` — DOCX does not provide byte
6//! offsets; only paragraph order is preserved.
7//!
8//! Security: the file is opened with `zip::ZipArchive` which bounds
9//! reads to the archive contents. No external entity expansion.
10
11use crate::normalize::normalize_document;
12use crate::types::{
13    DocumentExtractor, ExtractOutput, ExtractedSegment, LocationQuality, SegmentKind,
14};
15use orbok_core::{ErrorCategory, OrbokError, OrbokResult, versions::NORMALIZATION_VERSION};
16use orbok_fs::ValidatedPath;
17use std::io::Read;
18
/// Extractor identifier: returned by `DocxExtractor::name` and copied into
/// `ExtractOutput::extractor_name`.
const EXTRACTOR_NAME: &str = "docx";
/// Extractor version tag: returned by `DocxExtractor::version` and copied
/// into `ExtractOutput::extractor_version`.
const EXTRACTOR_VERSION: &str = "v1";
21
/// Stateless extractor for `.docx` files; all behavior lives in its
/// `DocumentExtractor` implementation.
pub struct DocxExtractor;
23
24impl DocumentExtractor for DocxExtractor {
25    fn name(&self) -> &'static str { EXTRACTOR_NAME }
26    fn version(&self) -> &'static str { EXTRACTOR_VERSION }
27    fn supported_extensions(&self) -> &'static [&'static str] { &["docx"] }
28
29    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
30        let file = std::fs::File::open(&path.canonical)?;
31        let mut zip = zip::ZipArchive::new(file).map_err(|e| OrbokError::Extraction {
32            category: ErrorCategory::ParserError,
33            message: format!("docx zip: {e}"),
34        })?;
35
36        let xml = match zip.by_name("word/document.xml") {
37            Ok(mut entry) => {
38                let mut s = String::new();
39                entry.read_to_string(&mut s).map_err(|e| OrbokError::Extraction {
40                    category: ErrorCategory::ParserError,
41                    message: format!("docx xml read: {e}"),
42                })?;
43                s
44            }
45            Err(_) => return Err(OrbokError::Extraction {
46                category: ErrorCategory::UnsupportedFormat,
47                message: "no word/document.xml in archive".into(),
48            }),
49        };
50
51        // Extract text runs from w:p paragraphs.
52        let paragraphs = extract_paragraphs(&xml);
53        let mut segments = Vec::new();
54        let mut total_chars = 0u64;
55
56        for (para_idx, para_text) in paragraphs.iter().enumerate() {
57            let norm = normalize_document(para_text);
58            if norm.trim().is_empty() { continue; }
59            total_chars += norm.len() as u64;
60            segments.push(ExtractedSegment {
61                kind: SegmentKind::Paragraph,
62                text: norm,
63                line_start: (para_idx + 1) as u32,
64                line_end: (para_idx + 1) as u32,
65                heading_path: None,
66                location_quality: LocationQuality::Approximate,
67            });
68        }
69
70        Ok(ExtractOutput {
71            extractor_name: EXTRACTOR_NAME.to_string(),
72            extractor_version: EXTRACTOR_VERSION.to_string(),
73            normalization_version: NORMALIZATION_VERSION.to_string(),
74            segments,
75            char_count: total_chars,
76        })
77    }
78}
79
/// Extract paragraph text from DOCX `word/document.xml` by collecting
/// all `w:t` text runs within each `w:p` paragraph element.
///
/// Returns one trimmed string per paragraph, in document order; paragraphs
/// whose text is empty after trimming are omitted.
///
/// Elements are matched on their exact name, so `w:tab`, `w:tbl`, `w:tc`,
/// `w:tr`, `w:pPr` etc. are never mistaken for `w:t` / `w:p`. Self-closing
/// `<w:t/>` and `<w:p/>` carry no text and are ignored. The five predefined
/// XML entities and numeric character references inside text runs are
/// decoded; unrecognized `&...;` sequences are kept verbatim.
///
/// This is a tolerant scanner, not a validating XML parser: malformed input
/// (unterminated tags, missing `</w:t>`) yields whatever text could be
/// recovered rather than an error.
fn extract_paragraphs(xml: &str) -> Vec<String> {
    let mut paragraphs = Vec::new();
    let mut current_para = String::new();
    let mut in_para = false;
    let mut pos = 0;
    let bytes = xml.as_bytes();

    while pos < bytes.len() {
        if bytes[pos] != b'<' {
            pos += 1;
            continue;
        }

        // `<` and `>` are ASCII, so `pos` and `end` are always char boundaries.
        let end = bytes[pos..]
            .iter()
            .position(|&b| b == b'>')
            .map_or(bytes.len(), |p| pos + p + 1);
        let tag = &xml[pos..end];
        let is_closing = tag.starts_with("</");
        let self_closing = tag.ends_with("/>");

        match (element_name(tag), is_closing) {
            ("w:p", false) if !self_closing => {
                in_para = true;
                current_para.clear();
            }
            ("w:p", true) => {
                if in_para {
                    let trimmed = current_para.trim();
                    if !trimmed.is_empty() {
                        paragraphs.push(trimmed.to_string());
                    }
                }
                in_para = false;
                current_para.clear();
            }
            ("w:t", false) if in_para && !self_closing => {
                // Collect text until </w:t>; if it is missing, take nothing.
                let text_start = end;
                let text_end = xml[text_start..]
                    .find("</w:t>")
                    .map_or(text_start, |p| text_start + p);
                push_run_text(&xml[text_start..text_end], &mut current_para);
                pos = text_end;
                continue;
            }
            _ => {}
        }
        pos = end;
    }
    paragraphs
}

/// Return the element name of a raw tag such as `<w:t xml:space="preserve">`,
/// `</w:p>` or `<w:tab/>` (here: `w:t`, `w:p`, `w:tab`).
fn element_name(tag: &str) -> &str {
    let inner = tag
        .strip_prefix("</")
        .or_else(|| tag.strip_prefix('<'))
        .unwrap_or(tag);
    let stop = inner
        .find(|c: char| c.is_whitespace() || c == '/' || c == '>')
        .unwrap_or(inner.len());
    &inner[..stop]
}

/// Append the content of one `w:t` run to `out`, dropping any stray nested
/// tags and decoding XML entities.
fn push_run_text(raw: &str, out: &mut String) {
    // Strip tags before decoding so that a decoded `&lt;` is kept as text
    // instead of being mistaken for the start of a tag.
    let mut stripped = String::with_capacity(raw.len());
    let mut in_tag = false;
    for c in raw.chars() {
        match c {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if !in_tag => stripped.push(c),
            _ => {}
        }
    }
    decode_xml_entities(&stripped, out);
}

/// Append `text` to `out`, replacing `&amp;`, `&lt;`, `&gt;`, `&quot;`,
/// `&apos;` and numeric references (`&#169;`, `&#xA9;`) with the characters
/// they denote. Anything else following `&` is copied through unchanged.
fn decode_xml_entities(text: &str, out: &mut String) {
    let mut rest = text;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];

        // Valid references are short; only look a few bytes ahead for `;`
        // so text full of bare ampersands stays linear. `;` is ASCII, so the
        // byte index found is a valid char boundary.
        let decoded = after
            .bytes()
            .take(10)
            .position(|b| b == b';')
            .and_then(|semi| {
                let name = &after[..semi];
                let ch = match name {
                    "amp" => Some('&'),
                    "lt" => Some('<'),
                    "gt" => Some('>'),
                    "quot" => Some('"'),
                    "apos" => Some('\''),
                    _ => name.strip_prefix('#').and_then(|num| {
                        let code = match num.strip_prefix('x').or_else(|| num.strip_prefix('X')) {
                            Some(hex) => u32::from_str_radix(hex, 16).ok()?,
                            None => num.parse::<u32>().ok()?,
                        };
                        char::from_u32(code)
                    }),
                };
                ch.map(|c| (c, semi))
            });

        match decoded {
            Some((c, semi)) => {
                out.push(c);
                rest = &after[semi + 1..];
            }
            None => {
                out.push('&');
                rest = after;
            }
        }
    }
    out.push_str(rest);
}
orbok_extract/docx.rs

orbok_extract/
docx.rs