use std::io::{Cursor, Read};
use quick_xml::events::Event;
use quick_xml::name::ResolveResult;
use quick_xml::reader::NsReader;
use super::super::LoaderError;
/// Namespace URI an element must be bound to before `parse_docx_xml` treats it as a
/// WordprocessingML paragraph (`p`) or text (`t`) element.
const WORD_NS: &[u8] = b"http://schemas.openxmlformats.org/wordprocessingml/2006/main";
/// Pulls the plain text out of an in-memory DOCX file.
///
/// The bytes are opened as a ZIP archive, the `word/document.xml` entry is read in full as a
/// UTF-8 string, and that XML is handed to `parse_docx_xml`.
///
/// # Errors
///
/// Returns [`LoaderError::ExtractionFailed`] when the archive cannot be opened, when it has no
/// `word/document.xml` entry, or when that entry cannot be read into a string. Errors produced
/// while parsing the XML are passed through unchanged.
pub(crate) fn extract(bytes: &[u8]) -> Result<String, LoaderError> {
    let mut docx = match zip::ZipArchive::new(Cursor::new(bytes)) {
        Ok(opened) => opened,
        Err(e) => {
            return Err(LoaderError::ExtractionFailed(format!(
                "Failed to open DOCX as ZIP: {e}"
            )));
        }
    };

    let mut document_xml = String::new();
    // The entry handle is a temporary of this statement, so its borrow of the archive ends
    // as soon as the read has finished.
    docx.by_name("word/document.xml")
        .map_err(|e| {
            LoaderError::ExtractionFailed(format!("Failed to find word/document.xml in DOCX: {e}"))
        })?
        .read_to_string(&mut document_xml)
        .map_err(|e| {
            LoaderError::ExtractionFailed(format!("Failed to read word/document.xml: {e}"))
        })?;

    parse_docx_xml(&document_xml)
}
/// Extracts the visible text of a WordprocessingML `document.xml` payload.
///
/// Text found inside `w:t` elements is accumulated per `w:p` paragraph. Each finished
/// paragraph is trimmed, empty paragraphs are dropped, and the remaining ones are joined with
/// a blank line (`"\n\n"`).
///
/// Only elements bound to [`WORD_NS`] are considered, for opening *and* closing tags, so a
/// same-named element from another vocabulary nested inside a paragraph (a `p` or `t` in a
/// different namespace) neither contributes text nor terminates the surrounding paragraph.
///
/// References inside text are resolved as follows: numeric character references
/// (`&#NNN;` / `&#xHHH;`) become the referenced character, the five predefined XML entities
/// are expanded by `resolve_xml_entity`, and anything else is dropped.
///
/// # Errors
///
/// Returns [`LoaderError::ExtractionFailed`] when the XML reader reports an error or when a
/// text node is not valid UTF-8.
fn parse_docx_xml(xml: &str) -> Result<String, LoaderError> {
    let mut reader = NsReader::from_str(xml);
    let mut paragraphs: Vec<String> = Vec::new();
    let mut current_paragraph = String::new();
    let mut in_paragraph = false;
    let mut in_text = false;
    loop {
        match reader.read_resolved_event() {
            Ok((resolved, Event::Start(ref e))) => {
                let local = e.local_name();
                let is_word =
                    matches!(resolved, ResolveResult::Bound(ns) if ns.as_ref() == WORD_NS);
                if local.as_ref() == b"p" && is_word {
                    in_paragraph = true;
                    current_paragraph.clear();
                } else if local.as_ref() == b"t" && is_word && in_paragraph {
                    in_text = true;
                }
            }
            Ok((resolved, Event::End(ref e))) => {
                let local = e.local_name();
                // Closing tags get the same namespace test as opening tags; otherwise a
                // foreign `</x:p>` inside a paragraph would end it early and the rest of the
                // paragraph's text would be lost.
                let is_word =
                    matches!(resolved, ResolveResult::Bound(ns) if ns.as_ref() == WORD_NS);
                if is_word && local.as_ref() == b"t" {
                    in_text = false;
                } else if is_word && local.as_ref() == b"p" && in_paragraph {
                    in_paragraph = false;
                    let trimmed = current_paragraph.trim();
                    if !trimmed.is_empty() {
                        paragraphs.push(trimmed.to_string());
                    }
                }
            }
            Ok((_, Event::Text(ref e))) if in_text => {
                let raw = std::str::from_utf8(e.as_ref()).map_err(|err| {
                    LoaderError::ExtractionFailed(format!("Invalid UTF-8 in DOCX XML: {err}"))
                })?;
                current_paragraph.push_str(raw);
            }
            Ok((_, Event::GeneralRef(ref e))) if in_text => {
                let name: &[u8] = e.as_ref();
                // Numeric character references are tried first; named references (and
                // malformed numeric ones) fall back to the predefined-entity lookup, which
                // yields an empty string for anything it does not know.
                match decode_numeric_char_ref(name) {
                    Some(ch) => current_paragraph.push(ch),
                    None => current_paragraph.push_str(resolve_xml_entity(name)),
                }
            }
            Ok((_, Event::Eof)) => break,
            Err(e) => {
                return Err(LoaderError::ExtractionFailed(format!(
                    "XML parse error in DOCX: {e}"
                )));
            }
            _ => {}
        }
    }
    Ok(paragraphs.join("\n\n"))
}

/// Decodes the body of a numeric character reference (`#NNN` or `#xHHH`, i.e. the text
/// between `&` and `;`).
///
/// Returns `None` when `name` is not a numeric reference, contains anything other than
/// digits of the indicated radix, overflows `u32`, or denotes NUL or a value that is not a
/// Unicode scalar value.
fn decode_numeric_char_ref(name: &[u8]) -> Option<char> {
    let body = std::str::from_utf8(name.strip_prefix(b"#")?).ok()?;
    let (digits, radix) = match body.strip_prefix('x') {
        Some(hex) => (hex, 16),
        None => (body, 10),
    };
    // `from_str_radix` would accept a leading `+`, which is not allowed in a reference, so
    // the digits are validated explicitly.
    if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
        return None;
    }
    let code = u32::from_str_radix(digits, radix).ok()?;
    char::from_u32(code).filter(|&ch| ch != '\0')
}
/// Maps the name of a predefined XML entity (the text between `&` and `;`) to the character
/// it stands for.
///
/// Recognises `amp`, `lt`, `gt`, `apos` and `quot`. Every other name, including the empty
/// name, yields an empty string.
fn resolve_xml_entity(name: &[u8]) -> &'static str {
    // Lookup table of entity name -> replacement text.
    const PREDEFINED: [(&[u8], &str); 5] = [
        (b"amp", "&"),
        (b"lt", "<"),
        (b"gt", ">"),
        (b"apos", "'"),
        (b"quot", "\""),
    ];

    PREDEFINED
        .iter()
        .find(|(entity, _)| *entity == name)
        .map_or("", |&(_, replacement)| replacement)
}
/// Unit tests for the DOCX text extraction: XML-level parsing via `parse_docx_xml` and
/// archive-level failure handling via `extract`.
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    reason = "test code — panics are acceptable failures"
)]
mod tests {
    use super::*;

    /// Paragraphs are joined with a blank line and whitespace-only paragraphs are dropped.
    #[test]
    fn parse_simple_docx_xml() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r><w:t>Hello world</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>Second paragraph</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t> </w:t></w:r>
</w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Hello world\n\nSecond paragraph");
    }

    /// Runs inside one paragraph are concatenated without a separator.
    #[test]
    fn parse_multiple_runs_in_paragraph() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r><w:t>Hello </w:t></w:r>
<w:r><w:t>world</w:t></w:r>
</w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Hello world");
    }

    /// A predefined entity inside text is expanded to its character.
    ///
    /// The fixture must spell the ampersand as `&amp;`: a bare `&` is not well-formed XML and
    /// would not exercise entity handling at all.
    #[test]
    fn parse_entities_in_text() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r><w:t>A &amp; B</w:t></w:r>
</w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "A & B");
    }

    /// Bytes that are not a ZIP archive are reported as `ExtractionFailed`.
    #[test]
    fn invalid_zip_returns_error() {
        let result = extract(b"not a zip file");
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(
            matches!(err, LoaderError::ExtractionFailed(_)),
            "expected ExtractionFailed, got {err:?}"
        );
    }

    /// Empty input is rejected rather than yielding empty text.
    #[test]
    fn empty_bytes_returns_error() {
        let result = extract(b"");
        assert!(result.is_err());
    }
}