1use crate::normalize::normalize_document;
12use crate::types::{
13 DocumentExtractor, ExtractOutput, ExtractedSegment, LocationQuality, SegmentKind,
14};
15use orbok_core::{ErrorCategory, OrbokError, OrbokResult, versions::NORMALIZATION_VERSION};
16use orbok_fs::ValidatedPath;
17use std::io::Read;
18
/// Identifier returned by `DocxExtractor::name` and recorded as the
/// `extractor_name` of every `ExtractOutput` this extractor produces.
const EXTRACTOR_NAME: &str = "docx";
/// Revision tag returned by `DocxExtractor::version` and recorded as the
/// `extractor_version` of every `ExtractOutput` this extractor produces.
const EXTRACTOR_VERSION: &str = "v1";
21
/// Stateless extractor for `.docx` files: it reads `word/document.xml` from
/// the zip container and emits one segment per non-blank paragraph.
pub struct DocxExtractor;
23
24impl DocumentExtractor for DocxExtractor {
25 fn name(&self) -> &'static str {
26 EXTRACTOR_NAME
27 }
28 fn version(&self) -> &'static str {
29 EXTRACTOR_VERSION
30 }
31 fn supported_extensions(&self) -> &'static [&'static str] {
32 &["docx"]
33 }
34
35 fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
36 let file = std::fs::File::open(&path.canonical)?;
37 let mut zip = zip::ZipArchive::new(file).map_err(|e| OrbokError::Extraction {
38 category: ErrorCategory::ParserError,
39 message: format!("docx zip: {e}"),
40 })?;
41
42 let xml = match zip.by_name("word/document.xml") {
43 Ok(mut entry) => {
44 let mut s = String::new();
45 entry
46 .read_to_string(&mut s)
47 .map_err(|e| OrbokError::Extraction {
48 category: ErrorCategory::ParserError,
49 message: format!("docx xml read: {e}"),
50 })?;
51 s
52 }
53 Err(_) => {
54 return Err(OrbokError::Extraction {
55 category: ErrorCategory::UnsupportedFormat,
56 message: "no word/document.xml in archive".into(),
57 });
58 }
59 };
60
61 let paragraphs = extract_paragraphs(&xml);
63 let mut segments = Vec::new();
64 let mut total_chars = 0u64;
65
66 for (para_idx, para_text) in paragraphs.iter().enumerate() {
67 let norm = normalize_document(para_text);
68 if norm.trim().is_empty() {
69 continue;
70 }
71 total_chars += norm.len() as u64;
72 segments.push(ExtractedSegment {
73 kind: SegmentKind::Paragraph,
74 text: norm,
75 line_start: (para_idx + 1) as u32,
76 line_end: (para_idx + 1) as u32,
77 heading_path: None,
78 location_quality: LocationQuality::Approximate,
79 });
80 }
81
82 Ok(ExtractOutput {
83 extractor_name: EXTRACTOR_NAME.to_string(),
84 extractor_version: EXTRACTOR_VERSION.to_string(),
85 normalization_version: NORMALIZATION_VERSION.to_string(),
86 segments,
87 char_count: total_chars,
88 })
89 }
90}
91
/// Collects the text of every non-blank `<w:p>` paragraph in a
/// WordprocessingML document.
///
/// The scan is a lightweight tag walk, not a full XML parse:
///
/// - an opening `<w:p>` (with or without attributes) starts a paragraph and
///   resets the text buffer; a self-closing `<w:p/>` is an empty paragraph;
/// - inside a paragraph, the content of each `<w:t>…</w:t>` element is
///   appended after removing any embedded markup and decoding XML entities;
/// - `</w:p>` finishes the paragraph; its trimmed text is kept only when it
///   is not empty.
///
/// Element names are matched exactly, so `<w:tab/>`, `<w:tabs>`, `<w:tbl>`,
/// `<w:tc>` and `<w:tr>` are never mistaken for the text element `<w:t>`.
///
/// Returns the paragraphs in document order. Malformed input never panics; a
/// `<w:t>` without a closing `</w:t>` simply contributes no text.
fn extract_paragraphs(xml: &str) -> Vec<String> {
    let mut paragraphs = Vec::new();
    let mut current = String::new();
    let mut in_para = false;
    let mut rest = xml;

    // Jump from tag to tag; text outside `<w:t>` elements is irrelevant.
    while let Some(lt) = rest.find('<') {
        let from_tag = &rest[lt..];
        // An unterminated tag at end of input extends to the end.
        let tag_len = from_tag.find('>').map_or(from_tag.len(), |gt| gt + 1);
        let (tag, after_tag) = from_tag.split_at(tag_len);
        rest = after_tag;

        if is_start_tag(tag, "w:p") {
            current.clear();
            // `<w:p/>` has no body, so there is nothing to collect.
            in_para = !tag.ends_with("/>");
        } else if tag == "</w:p>" {
            let trimmed = current.trim();
            if in_para && !trimmed.is_empty() {
                paragraphs.push(trimmed.to_string());
            }
            in_para = false;
            current.clear();
        } else if in_para && is_start_tag(tag, "w:t") && !tag.ends_with("/>") {
            if let Some(close) = rest.find("</w:t>") {
                let (raw, remaining) = rest.split_at(close);
                // Strip markup before decoding so that an escaped `&lt;`
                // is not later mistaken for the start of a tag.
                current.push_str(&decode_xml_entities(&strip_inline_markup(raw)));
                // Leave `rest` on `</w:t>`; the next iteration skips it.
                rest = remaining;
            }
        }
    }
    paragraphs
}

/// Returns `true` when `tag` is an opening or self-closing tag whose element
/// name is exactly `name` (for example `w:t` matches `<w:t>`,
/// `<w:t xml:space="preserve">` and `<w:t/>`, but not `<w:tab/>`).
fn is_start_tag(tag: &str, name: &str) -> bool {
    tag.strip_prefix('<')
        .and_then(|t| t.strip_prefix(name))
        .map_or(false, |after| {
            after.starts_with(|c: char| c == '>' || c == '/' || c.is_ascii_whitespace())
        })
}

/// Removes `<…>` markup from `text`, keeping everything else, including a
/// literal `>` that does not close a tag.
fn strip_inline_markup(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut in_tag = false;
    for c in text.chars() {
        match c {
            '<' => in_tag = true,
            '>' if in_tag => in_tag = false,
            _ if !in_tag => out.push(c),
            _ => {}
        }
    }
    out
}

/// Decodes the five predefined XML entities and decimal/hexadecimal
/// character references. Anything unrecognised is copied through verbatim.
fn decode_xml_entities(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let candidate = &rest[amp + 1..];

        let decoded = candidate.find(';').and_then(|semi| {
            let name = &candidate[..semi];
            let ch = match name {
                "amp" => Some('&'),
                "lt" => Some('<'),
                "gt" => Some('>'),
                "quot" => Some('"'),
                "apos" => Some('\''),
                _ => {
                    let code = name.strip_prefix('#')?;
                    let value = match code.strip_prefix('x') {
                        Some(hex) => u32::from_str_radix(hex, 16).ok()?,
                        None => code.parse::<u32>().ok()?,
                    };
                    char::from_u32(value)
                }
            }?;
            Some((ch, semi + 1))
        });

        match decoded {
            Some((ch, consumed)) => {
                out.push(ch);
                rest = &candidate[consumed..];
            }
            None => {
                // Not a reference we understand: keep the ampersand as-is.
                out.push('&');
                rest = candidate;
            }
        }
    }

    out.push_str(rest);
    out
}