use std::io::{Cursor, Read};
use std::path::Path;
use async_trait::async_trait;
use zip::ZipArchive;
use crate::Error;
use crate::error::Result;
use crate::parser::{DocumentFormat, DocumentMeta, DocumentParser, ParseResult, RawNode};
use super::styles::StyleResolver;
use super::types::DocxParagraph;
/// Parser for Microsoft Word DOCX documents.
///
/// Stateless; construct via [`DocxParser::new`] or `Default`. Parsing reads
/// the DOCX ZIP archive, resolves styles, and converts paragraphs into
/// `RawNode`s.
#[derive(Debug, Clone, Default)]
pub struct DocxParser;
impl DocxParser {
pub fn new() -> Self {
Self::default()
}
pub fn parse_file_sync(&self, path: &Path) -> Result<ParseResult> {
let bytes = std::fs::read(path)
.map_err(|e| Error::Parse(format!("Failed to read DOCX file: {}", e)))?;
self.parse_bytes(&bytes, path.file_stem().and_then(|s| s.to_str()))
}
pub fn parse_bytes(&self, bytes: &[u8], filename: Option<&str>) -> Result<ParseResult> {
let cursor = Cursor::new(bytes);
let mut archive = ZipArchive::new(cursor)
.map_err(|e| Error::Parse(format!("Failed to open DOCX archive: {}", e)))?;
let style_resolver = self.read_styles(&mut archive)?;
let document_xml = self.read_xml_file(&mut archive, "word/document.xml")?;
let paragraphs = self.parse_paragraphs(&document_xml, &style_resolver)?;
let nodes = self.build_raw_nodes(paragraphs)?;
let meta = DocumentMeta {
name: filename.unwrap_or("Document").to_string(),
format: DocumentFormat::Docx,
page_count: None,
line_count: nodes.len(),
source_path: None,
description: None,
};
Ok(ParseResult::new(meta, nodes))
}
fn read_styles(&self, archive: &mut ZipArchive<Cursor<&[u8]>>) -> Result<StyleResolver> {
match self.read_xml_file(archive, "word/styles.xml") {
Ok(xml) => Ok(StyleResolver::from_xml(&xml)),
Err(_) => {
Ok(StyleResolver::from_xml(""))
}
}
}
fn read_xml_file(&self, archive: &mut ZipArchive<Cursor<&[u8]>>, path: &str) -> Result<String> {
let mut file = archive
.by_name(path)
.map_err(|e| Error::Parse(format!("Failed to read {} from DOCX: {}", path, e)))?;
let mut content = String::new();
file.read_to_string(&mut content)
.map_err(|e| Error::Parse(format!("Failed to read {} content: {}", path, e)))?;
Ok(content)
}
const WORD_NS: &'static str = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
fn parse_paragraphs(
&self,
document_xml: &str,
style_resolver: &StyleResolver,
) -> Result<Vec<DocxParagraph>> {
let doc = roxmltree::Document::parse(document_xml)
.map_err(|e| Error::Parse(format!("Failed to parse document.xml: {}", e)))?;
let mut paragraphs = Vec::new();
for para_elem in doc
.descendants()
.filter(|n| n.has_tag_name((Self::WORD_NS, "p")))
{
if let Some(para) = self.parse_paragraph(¶_elem, style_resolver) {
paragraphs.push(para);
}
}
Ok(paragraphs)
}
fn parse_paragraph(
&self,
elem: &roxmltree::Node,
style_resolver: &StyleResolver,
) -> Option<DocxParagraph> {
let text = self.extract_text(elem);
if text.trim().is_empty() {
return None;
}
let mut para = DocxParagraph::new(text);
for child in elem.children() {
if child.has_tag_name((Self::WORD_NS, "pPr")) {
for ppr_child in child.children() {
if ppr_child.has_tag_name((Self::WORD_NS, "pStyle")) {
if let Some(style_id) = ppr_child.attribute((Self::WORD_NS, "val")) {
para.style_id = Some(style_id.to_string());
para.heading_level = style_resolver.get_heading_level(¶.style_id);
}
}
}
}
}
if para.heading_level.is_none() {
para.heading_level = style_resolver.detect_heading_by_heuristics(¶.text);
}
Some(para)
}
fn extract_text(&self, elem: &roxmltree::Node) -> String {
let mut text = String::new();
for text_elem in elem
.descendants()
.filter(|n| n.has_tag_name((Self::WORD_NS, "t")))
{
if let Some(t) = text_elem.text() {
text.push_str(t);
}
}
text
}
fn build_raw_nodes(&self, paragraphs: Vec<DocxParagraph>) -> Result<Vec<RawNode>> {
let mut nodes: Vec<RawNode> = Vec::new();
let mut current_sections: Vec<(u8, RawNode)> = Vec::new(); let mut has_headings = false;
let mut unassigned_text: Vec<String> = Vec::new();
for para in paragraphs {
if !para.has_content() {
continue;
}
if let Some(level) = para.heading_level {
has_headings = true;
if !unassigned_text.is_empty() {
if let Some((_, node)) = current_sections.last_mut() {
if !node.content.is_empty() {
node.content.push('\n');
}
node.content.push_str(&unassigned_text.join("\n"));
}
unassigned_text.clear();
}
self.finalize_deeper_sections(&mut current_sections, level);
let node = RawNode::new(¶.text).with_level(level as usize);
current_sections.push((level, node));
} else {
if current_sections.is_empty() {
unassigned_text.push(para.text);
} else {
if let Some((_, node)) = current_sections.last_mut() {
if !node.content.is_empty() {
node.content.push('\n');
}
node.content.push_str(¶.text);
}
}
}
}
while let Some((_level, node)) = current_sections.pop() {
nodes.insert(0, node);
}
if !has_headings {
let combined = unassigned_text.join("\n");
let node = RawNode::new("Document")
.with_content(combined)
.with_level(1);
return Ok(vec![node]);
}
Ok(nodes)
}
fn finalize_deeper_sections(&self, sections: &mut Vec<(u8, RawNode)>, new_level: u8) {
while let Some((level, _)) = sections.last() {
if *level >= new_level {
sections.pop();
} else {
break;
}
}
}
}
#[async_trait]
impl DocumentParser for DocxParser {
    /// Identifies the format handled by this parser.
    fn format(&self) -> DocumentFormat {
        DocumentFormat::Docx
    }

    /// Treats `content` as a filesystem path and delegates to `parse_file`.
    async fn parse(&self, content: &str) -> Result<ParseResult> {
        self.parse_file(Path::new(content)).await
    }

    /// Parses a DOCX file without blocking the async runtime: the synchronous
    /// ZIP/XML work is moved onto tokio's blocking thread pool.
    async fn parse_file(&self, path: &Path) -> Result<ParseResult> {
        let owned_path = path.to_path_buf();
        let handle =
            tokio::task::spawn_blocking(move || DocxParser::new().parse_file_sync(&owned_path));
        handle
            .await
            .map_err(|e| Error::Parse(format!("DOCX parsing task failed: {}", e)))?
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh parser reports the DOCX format.
    #[test]
    fn test_parser_creation() {
        assert_eq!(DocxParser::new().format(), DocumentFormat::Docx);
    }

    /// All `<w:t>` runs of a paragraph are concatenated in order.
    #[test]
    fn test_extract_text() {
        let xml = r#"
<w:p xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:r>
<w:t>Hello</w:t>
</w:r>
<w:r>
<w:t> World</w:t>
</w:r>
</w:p>
"#;
        let doc = roxmltree::Document::parse(xml).unwrap();
        let para_node = doc.root().first_child().unwrap();
        assert_eq!(DocxParser::new().extract_text(&para_node), "Hello World");
    }

    /// Without headings, all paragraphs collapse into one "Document" node.
    #[test]
    fn test_build_raw_nodes_no_headings() {
        let input = vec![
            DocxParagraph::new("First paragraph"),
            DocxParagraph::new("Second paragraph"),
        ];
        let nodes = DocxParser::new().build_raw_nodes(input).unwrap();
        assert_eq!(nodes.len(), 1, "Should have exactly one node");
        let only = &nodes[0];
        assert_eq!(
            only.title, "Document",
            "Node title should be 'Document'"
        );
        assert!(
            only.content.contains("First paragraph"),
            "Content should contain 'First paragraph', got: {:?}",
            only.content
        );
        assert!(
            only.content.contains("Second paragraph"),
            "Content should contain 'Second paragraph', got: {:?}",
            only.content
        );
    }

    /// Headings open sections; nested headings yield separate nodes.
    #[test]
    fn test_build_raw_nodes_with_headings() {
        let heading = |text: &str, level: u8| {
            let mut p = DocxParagraph::new(text);
            p.heading_level = Some(level);
            p
        };
        let input = vec![
            heading("Introduction", 1),
            DocxParagraph::new("This is the intro content."),
            heading("Details", 2),
            DocxParagraph::new("More details here."),
        ];
        let nodes = DocxParser::new().build_raw_nodes(input).unwrap();
        assert!(nodes.len() >= 2);
        assert!(nodes.iter().any(|n| n.title == "Introduction"));
        assert!(nodes.iter().any(|n| n.title == "Details"));
    }
}