1use crate::normalize::normalize_document;
12use crate::types::{
13 DocumentExtractor, ExtractContext, ExtractOutput, ExtractWarning, ExtractedSegment,
14 LocationKind, LocationQuality, SegmentKind, read_error_category,
15};
16use orbok_core::{ErrorCategory, OrbokError, OrbokResult, versions::NORMALIZATION_VERSION};
17use orbok_fs::ValidatedPath;
18use std::io::Read;
19
/// Identifier of this extractor; returned by `name()` and copied into
/// `ExtractOutput::extractor_name`.
const EXTRACTOR_NAME: &str = "docx";
/// Version tag of this extractor; returned by `version()` and copied into
/// `ExtractOutput::extractor_version`.
const EXTRACTOR_VERSION: &str = "v1";
22
23pub struct DocxExtractor;
24
25impl DocumentExtractor for DocxExtractor {
26 fn name(&self) -> &'static str {
27 EXTRACTOR_NAME
28 }
29
30 fn version(&self) -> &'static str {
31 EXTRACTOR_VERSION
32 }
33
34 fn supported_extensions(&self) -> &'static [&'static str] {
35 &["docx"]
36 }
37
38 fn extract_with_context(
39 &self,
40 path: &ValidatedPath,
41 context: &ExtractContext,
42 ) -> OrbokResult<ExtractOutput> {
43 let limits = &context.limits;
44 let mut warnings = Vec::new();
45
46 let meta = std::fs::metadata(&path.canonical).map_err(|e| OrbokError::Extraction {
48 category: read_error_category(&e),
49 message: e.to_string(),
50 })?;
51 if meta.len() > limits.max_zip_entry_bytes {
52 return Err(OrbokError::Extraction {
53 category: ErrorCategory::FileTooLarge,
54 message: format!(
55 "DOCX file is {} bytes, limit is {}",
56 meta.len(),
57 limits.max_zip_entry_bytes
58 ),
59 });
60 }
61
62 let file = std::fs::File::open(&path.canonical).map_err(|e| OrbokError::Extraction {
63 category: read_error_category(&e),
64 message: e.to_string(),
65 })?;
66 let mut zip = zip::ZipArchive::new(file).map_err(|e| OrbokError::Extraction {
67 category: ErrorCategory::ParserError,
68 message: format!("docx zip: {e}"),
69 })?;
70
71 let xml = match zip.by_name("word/document.xml") {
73 Ok(mut entry) => {
74 if entry.size() > limits.max_docx_xml_bytes {
75 warnings.push(ExtractWarning::SizeLimitReached {
76 limit_name: "max_docx_xml_bytes".into(),
77 });
78 let mut buf = vec![0u8; limits.max_docx_xml_bytes as usize];
80 let n = entry.read(&mut buf).map_err(|e| OrbokError::Extraction {
81 category: ErrorCategory::ParserError,
82 message: format!("docx xml read: {e}"),
83 })?;
84 buf.truncate(n);
85 String::from_utf8_lossy(&buf).into_owned()
87 } else {
88 let mut s = String::new();
89 entry
90 .read_to_string(&mut s)
91 .map_err(|e| OrbokError::Extraction {
92 category: ErrorCategory::ParserError,
93 message: format!("docx xml read: {e}"),
94 })?;
95 s
96 }
97 }
98 Err(_) => {
99 return Err(OrbokError::Extraction {
100 category: ErrorCategory::UnsupportedFormat,
101 message: "no word/document.xml in archive".into(),
102 });
103 }
104 };
105
106 let paragraphs = extract_paragraphs(&xml);
108 let mut segments = Vec::new();
109 let mut total_chars = 0u64;
110
111 for (para_idx, para_text) in paragraphs.iter().enumerate() {
112 if segments.len() >= limits.max_segments {
114 warnings.push(ExtractWarning::SizeLimitReached {
115 limit_name: "max_segments".into(),
116 });
117 break;
118 }
119
120 let norm = normalize_document(para_text);
121 if norm.trim().is_empty() {
122 continue;
123 }
124 let para_chars = norm.chars().count() as u64;
125 if total_chars + para_chars > limits.max_extracted_chars {
126 warnings.push(ExtractWarning::SizeLimitReached {
127 limit_name: "max_extracted_chars".into(),
128 });
129 break;
130 }
131 total_chars += para_chars;
132
133 segments.push(ExtractedSegment {
134 kind: SegmentKind::Paragraph,
135 text: norm,
136 line_start: (para_idx + 1) as u32,
137 line_end: (para_idx + 1) as u32,
138 location_kind: LocationKind::Paragraphs,
139 heading_path: None,
140 location_quality: LocationQuality::Approximate,
141 });
142 }
143
144 Ok(ExtractOutput {
145 extractor_name: EXTRACTOR_NAME.to_string(),
146 extractor_version: EXTRACTOR_VERSION.to_string(),
147 normalization_version: NORMALIZATION_VERSION.to_string(),
148 segments,
149 char_count: total_chars,
150 warnings,
151 })
152 }
153
154 fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
155 self.extract_with_context(path, &ExtractContext::default())
156 }
157}
158
/// Collects the text of every non-empty `<w:p>` paragraph in a
/// WordprocessingML document body.
///
/// This is a lightweight tag scanner, not a full XML parser: it walks the
/// tags in document order, concatenates the content of the `<w:t>` elements
/// found inside each paragraph, and pushes the trimmed result when the
/// paragraph closes. Paragraphs without text are skipped, and a paragraph
/// that is still open at the end of the input (e.g. truncated XML) is
/// dropped.
///
/// Only real `<w:t>` elements are treated as text. Elements whose names
/// merely start with `w:t` (`<w:tab/>`, `<w:tabs>`, ...) and self-closing
/// `<w:t/>` carry no text and have no matching `</w:t>`, so they are ignored.
/// XML character references inside the text are decoded.
fn extract_paragraphs(xml: &str) -> Vec<String> {
    let mut paragraphs = Vec::new();
    let mut current = String::new();
    let mut in_para = false;
    let mut rest = xml;

    while let Some(open) = rest.find('<') {
        let from_tag = &rest[open..];
        // An unterminated tag at the end of the input extends to the end.
        let tag_len = from_tag.find('>').map_or(from_tag.len(), |i| i + 1);
        let (tag, after_tag) = from_tag.split_at(tag_len);
        rest = after_tag;

        if is_open_tag(tag, "w:p") {
            current.clear();
            // `<w:p .../>` is an empty paragraph that is already closed.
            in_para = !tag.ends_with("/>");
        } else if tag.starts_with("</w:p>") {
            let text = current.trim();
            if in_para && !text.is_empty() {
                paragraphs.push(text.to_string());
            }
            in_para = false;
            current.clear();
        } else if in_para && is_open_tag(tag, "w:t") && !tag.ends_with("/>") {
            // Without a closing tag (truncated input) there is no text to take.
            let text_len = rest.find("</w:t>").unwrap_or(0);
            let (text, after_text) = rest.split_at(text_len);
            append_run_text(&mut current, text);
            rest = after_text;
        }
    }
    paragraphs
}

/// Returns `true` when `tag` opens (or self-closes) the element called
/// exactly `name`, e.g. `<w:t>`, `<w:t xml:space="preserve">` or `<w:t/>`,
/// but not `<w:tab/>`.
fn is_open_tag(tag: &str, name: &str) -> bool {
    tag.strip_prefix('<')
        .and_then(|t| t.strip_prefix(name))
        .map_or(false, |after| {
            after.starts_with(|c: char| c == '>' || c == '/' || c.is_ascii_whitespace())
        })
}

/// Appends the character data of one `<w:t>` element to `out`.
///
/// Any markup nested in `raw` is skipped and XML character references are
/// decoded. An `&` that does not start a recognised reference is kept as-is.
fn append_run_text(out: &mut String, raw: &str) {
    let mut in_tag = false;
    let mut rest = raw;
    while let Some(c) = rest.chars().next() {
        let mut consumed = c.len_utf8();
        match c {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if in_tag => {}
            '&' => match decode_xml_entity(rest) {
                Some((decoded, len)) => {
                    out.push(decoded);
                    consumed = len;
                }
                None => out.push('&'),
            },
            _ => out.push(c),
        }
        rest = &rest[consumed..];
    }
}

/// Decodes the XML character reference at the start of `s` (which must begin
/// with `&`), returning the character and the number of bytes it occupied.
///
/// Handles the five predefined entities and decimal / hexadecimal numeric
/// references; returns `None` for anything else.
fn decode_xml_entity(s: &str) -> Option<(char, usize)> {
    // The longest valid reference (`&#x10FFFF;`) is 10 bytes; bounding the
    // search keeps text full of bare ampersands linear.
    let semicolon = s.bytes().take(12).position(|b| b == b';')?;
    let body = s.get(1..semicolon)?;
    let decoded = match body {
        "amp" => '&',
        "lt" => '<',
        "gt" => '>',
        "quot" => '"',
        "apos" => '\'',
        _ => {
            let number = body.strip_prefix('#')?;
            let (digits, radix) = match number.strip_prefix('x') {
                Some(hex) => (hex, 16),
                None => (number, 10),
            };
            // `from_str_radix` would accept a leading sign; XML does not.
            if !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
                return None;
            }
            char::from_u32(u32::from_str_radix(digits, radix).ok()?)?
        }
    };
    Some((decoded, semicolon + 1))
}