1use crate::normalize::normalize_document;
12use crate::types::{
13 DocumentExtractor, ExtractOutput, ExtractedSegment, LocationQuality, SegmentKind,
14};
15use orbok_core::{ErrorCategory, OrbokError, OrbokResult, versions::NORMALIZATION_VERSION};
16use orbok_fs::ValidatedPath;
17use std::io::Read;
18
/// Identifier returned by `DocxExtractor::name` and recorded as
/// `extractor_name` in every `ExtractOutput` this module produces.
const EXTRACTOR_NAME: &str = "docx";
/// Version tag returned by `DocxExtractor::version` and recorded as
/// `extractor_version` in every `ExtractOutput` this module produces.
const EXTRACTOR_VERSION: &str = "v1";
21
/// Stateless extractor for `.docx` files.
///
/// It carries no data, so it is free to copy and can be built with
/// `DocxExtractor` or `DocxExtractor::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct DocxExtractor;
23
24impl DocumentExtractor for DocxExtractor {
25 fn name(&self) -> &'static str { EXTRACTOR_NAME }
26 fn version(&self) -> &'static str { EXTRACTOR_VERSION }
27 fn supported_extensions(&self) -> &'static [&'static str] { &["docx"] }
28
29 fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
30 let file = std::fs::File::open(&path.canonical)?;
31 let mut zip = zip::ZipArchive::new(file).map_err(|e| OrbokError::Extraction {
32 category: ErrorCategory::ParserError,
33 message: format!("docx zip: {e}"),
34 })?;
35
36 let xml = match zip.by_name("word/document.xml") {
37 Ok(mut entry) => {
38 let mut s = String::new();
39 entry.read_to_string(&mut s).map_err(|e| OrbokError::Extraction {
40 category: ErrorCategory::ParserError,
41 message: format!("docx xml read: {e}"),
42 })?;
43 s
44 }
45 Err(_) => return Err(OrbokError::Extraction {
46 category: ErrorCategory::UnsupportedFormat,
47 message: "no word/document.xml in archive".into(),
48 }),
49 };
50
51 let paragraphs = extract_paragraphs(&xml);
53 let mut segments = Vec::new();
54 let mut total_chars = 0u64;
55
56 for (para_idx, para_text) in paragraphs.iter().enumerate() {
57 let norm = normalize_document(para_text);
58 if norm.trim().is_empty() { continue; }
59 total_chars += norm.len() as u64;
60 segments.push(ExtractedSegment {
61 kind: SegmentKind::Paragraph,
62 text: norm,
63 line_start: (para_idx + 1) as u32,
64 line_end: (para_idx + 1) as u32,
65 heading_path: None,
66 location_quality: LocationQuality::Approximate,
67 });
68 }
69
70 Ok(ExtractOutput {
71 extractor_name: EXTRACTOR_NAME.to_string(),
72 extractor_version: EXTRACTOR_VERSION.to_string(),
73 normalization_version: NORMALIZATION_VERSION.to_string(),
74 segments,
75 char_count: total_chars,
76 })
77 }
78}
79
/// Extracts the text of every non-empty `<w:p>` paragraph from the XML of a
/// `word/document.xml` part.
///
/// This is a lightweight tag scanner, not a full XML parser. Inside a
/// paragraph it collects the content of `<w:t>` elements, decodes XML
/// character/entity references, and maps `<w:tab/>` to `'\t'` and
/// `<w:br/>` / `<w:cr/>` to `'\n'`. Each paragraph is trimmed; paragraphs that
/// are empty after trimming are omitted.
///
/// Element names are matched exactly, so `<w:tab/>`, `<w:tabs>`, `<w:tbl>`
/// and similar are never mistaken for a `<w:t>` text run.
fn extract_paragraphs(xml: &str) -> Vec<String> {
    const TEXT_CLOSE: &str = "</w:t>";

    let mut paragraphs = Vec::new();
    let mut current = String::new();
    let mut in_para = false;
    let mut pos = 0;

    while let Some(offset) = xml[pos..].find('<') {
        let start = pos + offset;
        // A tag runs up to and including the next '>' (or to end of input if
        // the document is truncated).
        let end = xml[start..].find('>').map_or(xml.len(), |p| start + p + 1);
        let tag = &xml[start..end];
        pos = end;

        if is_closing_tag(tag, "w:p") {
            let text = current.trim();
            if in_para && !text.is_empty() {
                paragraphs.push(text.to_string());
            }
            in_para = false;
            current.clear();
            continue;
        }

        let (name, attrs) = match split_open_tag(tag) {
            Some(parts) => parts,
            None => continue,
        };
        let self_closing = tag.ends_with("/>");

        match name {
            "w:p" if !self_closing => {
                in_para = true;
                current.clear();
            }
            // A self-closing `<w:t/>` has no content and no closing tag, so it
            // must not trigger a forward search for `</w:t>`.
            "w:t" if in_para && !self_closing => {
                let text_end = xml[end..].find(TEXT_CLOSE).map_or(end, |p| end + p);
                let raw = strip_tags(&xml[end..text_end]);
                current.push_str(&decode_xml_entities(&raw));
                pos = text_end;
            }
            // A tab character is an attribute-less `<w:tab/>`; tab-stop
            // definitions inside `<w:tabs>` always carry attributes.
            "w:tab" if in_para && attrs.is_empty() => current.push('\t'),
            "w:br" | "w:cr" if in_para => current.push('\n'),
            _ => {}
        }
    }
    paragraphs
}

/// Returns `true` if `tag` is the closing tag for element `name` (`</name>`).
fn is_closing_tag(tag: &str, name: &str) -> bool {
    tag.strip_prefix("</")
        .map(|rest| rest.strip_suffix('>').unwrap_or(rest).trim_end() == name)
        .unwrap_or(false)
}

/// Splits an opening or self-closing tag into `(element_name, attribute_text)`.
///
/// Returns `None` for closing tags, comments, declarations and processing
/// instructions.
fn split_open_tag(tag: &str) -> Option<(&str, &str)> {
    let body = tag.strip_prefix('<')?;
    if body.starts_with(|c: char| matches!(c, '/' | '?' | '!')) {
        return None;
    }
    let body = body.strip_suffix('>').unwrap_or(body);
    let body = body.strip_suffix('/').unwrap_or(body);
    let name_len = body.find(char::is_whitespace).unwrap_or(body.len());
    Some((&body[..name_len], body[name_len..].trim()))
}

/// Removes anything between `<` and `>` from `text`, keeping the rest.
fn strip_tags(text: &str) -> String {
    let mut in_tag = false;
    text.chars()
        .filter(|&c| match c {
            '<' => {
                in_tag = true;
                false
            }
            '>' => {
                in_tag = false;
                false
            }
            _ => !in_tag,
        })
        .collect()
}

/// Decodes the five predefined XML entities and numeric character references.
///
/// Anything that is not a recognised reference is copied through unchanged.
fn decode_xml_entities(text: &str) -> String {
    // References are short; bounding the search for ';' keeps stray '&'
    // characters from causing repeated scans of the remaining text.
    const MAX_ENTITY_LEN: usize = 12;

    let mut out = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];
        let decoded = after
            .bytes()
            .take(MAX_ENTITY_LEN)
            .position(|b| b == b';')
            .and_then(|semi| {
                let entity = &after[..semi];
                let ch = match entity {
                    "amp" => Some('&'),
                    "lt" => Some('<'),
                    "gt" => Some('>'),
                    "quot" => Some('"'),
                    "apos" => Some('\''),
                    _ => entity.strip_prefix('#').and_then(|num| {
                        let code = match num.strip_prefix('x') {
                            Some(hex) => u32::from_str_radix(hex, 16).ok(),
                            None => num.parse::<u32>().ok(),
                        };
                        code.and_then(char::from_u32)
                    }),
                };
                ch.map(|c| (c, semi))
            });
        match decoded {
            Some((c, semi)) => {
                out.push(c);
                rest = &after[semi + 1..];
            }
            None => {
                out.push('&');
                rest = after;
            }
        }
    }
    out.push_str(rest);
    out
}