#![cfg(feature = "office")]
use crate::Result;
use crate::core::config::ExtractionConfig;
use crate::extraction::{cells_to_markdown, office_metadata};
use crate::plugins::{DocumentExtractor, Plugin};
#[cfg(feature = "tokio-runtime")]
use crate::types::PageBoundary;
use crate::types::{ExtractionResult, Metadata, PageInfo, PageStructure, PageUnitType, Table};
use ahash::AHashMap;
use async_trait::async_trait;
use std::borrow::Cow;
use std::io::Cursor;
/// Extractor plugin for DOCX documents.
///
/// The type is a unit struct: it carries no fields, so every value is
/// interchangeable and construction cannot fail.
pub struct DocxExtractor;

impl DocxExtractor {
    /// Creates a new extractor.
    pub fn new() -> Self {
        DocxExtractor
    }
}

impl Default for DocxExtractor {
    /// Equivalent to [`DocxExtractor::new`].
    fn default() -> Self {
        DocxExtractor::new()
    }
}
/// Plugin identity and lifecycle hooks for the DOCX extractor.
impl Plugin for DocxExtractor {
    /// Identifier under which this plugin is known.
    fn name(&self) -> &str {
        "docx-extractor"
    }

    /// Short human-readable summary of the plugin.
    fn description(&self) -> &str {
        "High-performance DOCX text extraction with metadata support"
    }

    /// Name of the plugin's author.
    fn author(&self) -> &str {
        "Kreuzberg Team"
    }

    /// Package version captured from `CARGO_PKG_VERSION` at compile time.
    fn version(&self) -> String {
        String::from(env!("CARGO_PKG_VERSION"))
    }

    /// No setup is performed; always returns `Ok(())`.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// No teardown is performed; always returns `Ok(())`.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
/// Converts a parsed DOCX table into the crate-level [`Table`] representation.
///
/// Each cell's paragraphs are rendered with `runs_to_markdown`, joined with a
/// single space, and trimmed. The resulting grid is also rendered to Markdown
/// via `cells_to_markdown`.
///
/// `table_index` is the zero-based position of the table; the returned
/// `page_number` is set to `table_index + 1`.
fn convert_docx_table_to_table(docx_table: &crate::extraction::docx::parser::Table, table_index: usize) -> Table {
    let mut cells: Vec<Vec<String>> = Vec::new();
    for row in docx_table.rows.iter() {
        let mut row_texts: Vec<String> = Vec::new();
        for cell in row.cells.iter() {
            let paragraph_texts: Vec<_> = cell.paragraphs.iter().map(|para| para.runs_to_markdown()).collect();
            let joined = paragraph_texts.join(" ");
            row_texts.push(joined.trim().to_owned());
        }
        cells.push(row_texts);
    }

    // Render before moving `cells` into the result.
    let markdown = cells_to_markdown(&cells);
    let page_number = table_index + 1;

    Table {
        cells,
        markdown,
        page_number,
    }
}
#[async_trait]
impl DocumentExtractor for DocxExtractor {
#[cfg_attr(feature = "otel", tracing::instrument(
skip(self, content, _config),
fields(
extractor.name = self.name(),
content.size_bytes = content.len(),
)
))]
async fn extract_bytes(
&self,
content: &[u8],
mime_type: &str,
_config: &ExtractionConfig,
) -> Result<ExtractionResult> {
let (text, tables, page_boundaries) = {
#[cfg(feature = "tokio-runtime")]
if crate::core::batch_mode::is_batch_mode() {
let content_owned = content.to_vec();
let span = tracing::Span::current();
tokio::task::spawn_blocking(
move || -> crate::error::Result<(String, Vec<Table>, Option<Vec<PageBoundary>>)> {
let _guard = span.entered();
let doc = crate::extraction::docx::parser::parse_document(&content_owned)?;
let text = doc.to_markdown();
let tables: Vec<Table> = doc
.tables
.iter()
.enumerate()
.map(|(idx, table)| convert_docx_table_to_table(table, idx))
.collect();
let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(&content_owned)?;
Ok((text, tables, page_boundaries))
},
)
.await
.map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
} else {
let doc = crate::extraction::docx::parser::parse_document(content)?;
let text = doc.to_markdown();
let tables: Vec<Table> = doc
.tables
.iter()
.enumerate()
.map(|(idx, table)| convert_docx_table_to_table(table, idx))
.collect();
let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(content)?;
(text, tables, page_boundaries)
}
#[cfg(not(feature = "tokio-runtime"))]
{
let doc = crate::extraction::docx::parser::parse_document(content)?;
let text = doc.to_markdown();
let tables: Vec<Table> = doc
.tables
.iter()
.enumerate()
.map(|(idx, table)| convert_docx_table_to_table(table, idx))
.collect();
let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(content)?;
(text, tables, page_boundaries)
}
};
let mut archive = {
#[cfg(feature = "tokio-runtime")]
if crate::core::batch_mode::is_batch_mode() {
let content_owned = content.to_vec();
let span = tracing::Span::current();
tokio::task::spawn_blocking(move || -> crate::error::Result<_> {
let _guard = span.entered();
let cursor = Cursor::new(content_owned);
zip::ZipArchive::new(cursor).map_err(|e| {
crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e))
})
})
.await
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Task join error: {}", e)))??
} else {
let content_owned = content.to_vec();
let cursor = Cursor::new(content_owned);
zip::ZipArchive::new(cursor)
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?
}
#[cfg(not(feature = "tokio-runtime"))]
{
let content_owned = content.to_vec();
let cursor = Cursor::new(content_owned);
zip::ZipArchive::new(cursor)
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?
}
};
let mut metadata_map = AHashMap::new();
let mut parsed_keywords: Option<Vec<String>> = None;
if let Ok(core) = office_metadata::extract_core_properties(&mut archive) {
if let Some(title) = core.title {
metadata_map.insert(Cow::Borrowed("title"), serde_json::Value::String(title));
}
if let Some(creator) = core.creator {
metadata_map.insert(
Cow::Borrowed("authors"),
serde_json::Value::Array(vec![serde_json::Value::String(creator.clone())]),
);
metadata_map.insert(Cow::Borrowed("created_by"), serde_json::Value::String(creator));
}
if let Some(subject) = core.subject {
metadata_map.insert(Cow::Borrowed("subject"), serde_json::Value::String(subject));
}
if let Some(keywords) = core.keywords {
parsed_keywords = Some(
keywords
.split(',')
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
.collect(),
);
}
if let Some(description) = core.description {
metadata_map.insert(Cow::Borrowed("description"), serde_json::Value::String(description));
}
if let Some(modified_by) = core.last_modified_by {
metadata_map.insert(Cow::Borrowed("modified_by"), serde_json::Value::String(modified_by));
}
if let Some(created) = core.created {
metadata_map.insert(Cow::Borrowed("created_at"), serde_json::Value::String(created));
}
if let Some(modified) = core.modified {
metadata_map.insert(Cow::Borrowed("modified_at"), serde_json::Value::String(modified));
}
if let Some(revision) = core.revision {
metadata_map.insert(Cow::Borrowed("revision"), serde_json::Value::String(revision));
}
if let Some(category) = core.category {
metadata_map.insert(Cow::Borrowed("category"), serde_json::Value::String(category));
}
if let Some(content_status) = core.content_status {
metadata_map.insert(
Cow::Borrowed("content_status"),
serde_json::Value::String(content_status),
);
}
if let Some(language) = core.language {
metadata_map.insert(Cow::Borrowed("language"), serde_json::Value::String(language));
}
}
if let Ok(app) = office_metadata::extract_docx_app_properties(&mut archive) {
if let Some(pages) = app.pages {
metadata_map.insert(Cow::Borrowed("page_count"), serde_json::Value::Number(pages.into()));
}
if let Some(words) = app.words {
metadata_map.insert(Cow::Borrowed("word_count"), serde_json::Value::Number(words.into()));
}
if let Some(chars) = app.characters {
metadata_map.insert(
Cow::Borrowed("character_count"),
serde_json::Value::Number(chars.into()),
);
}
if let Some(lines) = app.lines {
metadata_map.insert(Cow::Borrowed("line_count"), serde_json::Value::Number(lines.into()));
}
if let Some(paragraphs) = app.paragraphs {
metadata_map.insert(
Cow::Borrowed("paragraph_count"),
serde_json::Value::Number(paragraphs.into()),
);
}
if let Some(template) = app.template {
metadata_map.insert(Cow::Borrowed("template"), serde_json::Value::String(template));
}
if let Some(company) = app.company {
metadata_map.insert(Cow::Borrowed("company"), serde_json::Value::String(company));
}
if let Some(time) = app.total_time {
metadata_map.insert(
Cow::Borrowed("total_editing_time_minutes"),
serde_json::Value::Number(time.into()),
);
}
if let Some(application) = app.application {
metadata_map.insert(Cow::Borrowed("application"), serde_json::Value::String(application));
}
}
if let Ok(custom) = office_metadata::extract_custom_properties(&mut archive) {
for (key, value) in custom {
metadata_map.insert(Cow::Owned(format!("custom_{}", key)), value);
}
}
let page_structure = if let Some(boundaries) = page_boundaries {
let total_count = boundaries.len();
Some(PageStructure {
total_count,
unit_type: PageUnitType::Page,
boundaries: Some(boundaries),
pages: Some(
(1..=total_count)
.map(|page_num| PageInfo {
number: page_num,
title: None,
dimensions: None,
image_count: None,
table_count: None,
hidden: None,
is_blank: None,
})
.collect(),
),
})
} else {
None
};
Ok(ExtractionResult {
content: text,
mime_type: mime_type.to_string().into(),
metadata: Metadata {
pages: page_structure,
keywords: parsed_keywords,
additional: metadata_map,
..Default::default()
},
pages: None,
tables,
detected_languages: None,
chunks: None,
images: Some(vec![]),
djot_content: None,
elements: None,
ocr_elements: None,
document: None,
})
}
fn supported_mime_types(&self) -> &[&str] {
&["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
}
fn priority(&self) -> i32 {
50
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// MIME type the extractor is expected to advertise.
    const DOCX_MIME: &str = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    /// The plugin reports its name, version, priority, and a single MIME type.
    #[tokio::test]
    async fn test_docx_extractor_plugin_interface() {
        let extractor = DocxExtractor::new();
        let mime_types = extractor.supported_mime_types();

        assert_eq!(extractor.name(), "docx-extractor");
        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
        assert_eq!(extractor.priority(), 50);
        assert_eq!(mime_types.len(), 1);
    }

    /// The DOCX MIME type is among the supported types.
    #[tokio::test]
    async fn test_docx_extractor_supports_docx() {
        let extractor = DocxExtractor::new();
        let mime_types = extractor.supported_mime_types();

        assert!(mime_types.contains(&DOCX_MIME));
    }

    /// A directly constructed unit value behaves like any other extractor.
    #[tokio::test]
    async fn test_docx_extractor_default() {
        let extractor = DocxExtractor;

        assert_eq!(extractor.name(), "docx-extractor");
    }

    /// Both lifecycle hooks succeed.
    #[tokio::test]
    async fn test_docx_extractor_initialize_shutdown() {
        let extractor = DocxExtractor::new();

        assert!(extractor.initialize().is_ok());
        assert!(extractor.shutdown().is_ok());
    }

    /// A 2x2 table converts to the expected cell grid and Markdown rows.
    #[test]
    fn test_convert_docx_table_to_table() {
        use crate::extraction::docx::parser::{Paragraph, Run, Table as DocxTable, TableCell, TableRow};

        // Builds a cell holding one paragraph with one run of `text`.
        let make_cell = |text: &str| {
            let mut paragraph = Paragraph::new();
            paragraph.add_run(Run::new(text.to_string()));
            let mut cell = TableCell::default();
            cell.paragraphs.push(paragraph);
            cell
        };
        // Builds a row with one cell per entry of `texts`, in order.
        let make_row = |texts: &[&str]| {
            let mut row = TableRow::default();
            for &text in texts {
                row.cells.push(make_cell(text));
            }
            row
        };

        let mut table = DocxTable::new();
        table.rows.push(make_row(&["Name", "Age"]));
        table.rows.push(make_row(&["Alice", "30"]));

        let converted = convert_docx_table_to_table(&table, 0);

        assert_eq!(converted.page_number, 1);
        assert_eq!(converted.cells.len(), 2);
        assert_eq!(converted.cells[0], vec!["Name", "Age"]);
        assert_eq!(converted.cells[1], vec!["Alice", "30"]);
        assert!(converted.markdown.contains("| Name | Age |"));
        assert!(converted.markdown.contains("| Alice | 30 |"));
    }
}