#![cfg(feature = "office")]
use crate::Result;
use crate::core::config::ExtractionConfig;
use crate::extraction::{cells_to_markdown, office_metadata};
use crate::plugins::{DocumentExtractor, Plugin};
use crate::types::{ExtractionResult, Metadata, Table};
use ahash::AHashMap;
use async_trait::async_trait;
use roxmltree::Document;
use std::borrow::Cow;
use std::io::Cursor;
/// Extractor plugin for OpenDocument Text (ODT) documents.
///
/// The type is a unit struct and carries no state of its own.
pub struct OdtExtractor;

impl OdtExtractor {
    /// Creates a new extractor value.
    pub fn new() -> Self {
        OdtExtractor
    }
}

impl Default for OdtExtractor {
    /// Builds the extractor through [`OdtExtractor::new`].
    fn default() -> Self {
        OdtExtractor::new()
    }
}
impl Plugin for OdtExtractor {
    /// Identifier under which this plugin reports itself.
    fn name(&self) -> &str {
        const NAME: &str = "odt-extractor";
        NAME
    }

    /// Crate version captured from `CARGO_PKG_VERSION` at compile time.
    fn version(&self) -> String {
        String::from(env!("CARGO_PKG_VERSION"))
    }

    /// No setup is performed; always succeeds.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// No teardown is performed; always succeeds.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }

    /// Human-readable summary of the plugin.
    fn description(&self) -> &str {
        const DESCRIPTION: &str =
            "Native Rust ODT (OpenDocument Text) extractor with metadata and table support";
        DESCRIPTION
    }

    /// Author string reported for the plugin.
    fn author(&self) -> &str {
        const AUTHOR: &str = "Kreuzberg Team";
        AUTHOR
    }
}
fn extract_mathml_text(math_node: roxmltree::Node) -> Option<String> {
for node in math_node.descendants() {
if node.tag_name().name() == "annotation"
&& let Some(encoding) = node.attribute("encoding")
&& encoding.contains("StarMath")
&& let Some(text) = node.text()
{
return Some(text.to_string());
}
}
let mut formula_parts = Vec::new();
for node in math_node.descendants() {
match node.tag_name().name() {
"mi" | "mo" | "mn" | "ms" | "mtext" => {
if let Some(text) = node.text() {
formula_parts.push(text.to_string());
}
}
_ => {}
}
}
if !formula_parts.is_empty() {
Some(formula_parts.join(" "))
} else {
None
}
}
/// Collects the text of formulas stored as embedded objects in the archive.
///
/// Every archive entry whose path contains `"Object"` and ends with
/// `"content.xml"` is inspected. Entries that cannot be opened, read, or
/// parsed as XML are skipped silently. If the root element is `math`, only
/// the root is converted; otherwise every `math` descendant is converted.
///
/// Returns the formula texts joined with newlines (empty when none found).
fn extract_embedded_formulas(archive: &mut zip::ZipArchive<Cursor<Vec<u8>>>) -> crate::error::Result<String> {
    use std::io::Read;

    // Names are copied out first so the archive can be borrowed mutably below.
    let candidate_names: Vec<String> = archive
        .file_names()
        .filter(|name| name.contains("Object") && name.ends_with("content.xml"))
        .map(str::to_owned)
        .collect();

    let mut formulas: Vec<String> = Vec::new();
    for entry_name in &candidate_names {
        let Ok(mut entry) = archive.by_name(entry_name) else {
            continue;
        };
        let mut raw_xml = String::new();
        if entry.read_to_string(&mut raw_xml).is_err() {
            continue;
        }
        let Ok(parsed) = Document::parse(&raw_xml) else {
            continue;
        };

        let root = parsed.root_element();
        if root.tag_name().name() == "math" {
            formulas.extend(extract_mathml_text(root));
        } else {
            formulas.extend(
                root.descendants()
                    .filter(|candidate| candidate.tag_name().name() == "math")
                    .filter_map(|candidate| extract_mathml_text(candidate)),
            );
        }
    }

    Ok(formulas.join("\n"))
}
/// Extracts the document text from the archive's `content.xml`.
///
/// Walks `body` children of the root element and, inside them, every `text`
/// child, delegating to [`process_document_elements`]. The collected parts
/// are joined with newlines and the result is trimmed.
///
/// # Errors
///
/// Returns a parsing error when `content.xml` exists but cannot be read or
/// is not well-formed XML. A missing `content.xml` yields an empty string.
fn extract_content_text(archive: &mut zip::ZipArchive<Cursor<Vec<u8>>>) -> crate::error::Result<String> {
    let mut raw_xml = String::new();
    {
        use std::io::Read;
        let Ok(mut entry) = archive.by_name("content.xml") else {
            return Ok(String::new());
        };
        entry
            .read_to_string(&mut raw_xml)
            .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to read content.xml: {}", e)))?;
    }

    let parsed = Document::parse(&raw_xml)
        .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to parse content.xml: {}", e)))?;

    let text_sections = parsed
        .root_element()
        .children()
        .filter(|child| child.tag_name().name() == "body")
        .flat_map(|body| body.children())
        .filter(|child| child.tag_name().name() == "text");

    let mut collected: Vec<String> = Vec::new();
    for section in text_sections {
        process_document_elements(section, &mut collected);
    }

    Ok(collected.join("\n").trim().to_string())
}
/// Appends the rendered form of each direct child of `parent` to `text_parts`.
///
/// * `h` — non-blank text is emitted as `# <text>`.
/// * `p` — non-blank text is emitted as-is (trimmed).
/// * `table` — emitted through [`extract_table_text`].
/// * `list` — emitted through [`process_list_elements`] at depth 0.
/// * `section` — processed recursively.
///
/// Each emitted heading, paragraph, table and list is followed by an empty
/// entry so blocks end up separated by a blank line once joined. Any other
/// child is ignored.
fn process_document_elements(parent: roxmltree::Node, text_parts: &mut Vec<String>) {
    for element in parent.children() {
        let tag = element.tag_name().name();
        let rendered: Option<String> = match tag {
            "h" | "p" => {
                let prefix = if tag == "h" { "# " } else { "" };
                match extract_node_text(element) {
                    Some(raw) if !raw.trim().is_empty() => Some(format!("{prefix}{}", raw.trim())),
                    _ => None,
                }
            }
            "table" => extract_table_text(element),
            "list" => {
                process_list_elements(element, text_parts, 0);
                text_parts.push(String::new());
                continue;
            }
            "section" => {
                process_document_elements(element, text_parts);
                continue;
            }
            _ => continue,
        };

        if let Some(block) = rendered {
            text_parts.push(block);
            text_parts.push(String::new());
        }
    }
}
/// Renders the `list-item` children of `list_node` as dash-prefixed lines.
///
/// Within each item, `p` children become `- <text>`, `h` children become
/// `- # <text>`, and nested `list` children are rendered recursively with
/// `depth + 1`. Every line is prefixed with one space per nesting level.
/// Blank texts and other child elements are skipped.
fn process_list_elements(list_node: roxmltree::Node, text_parts: &mut Vec<String>, depth: usize) {
    let indent = " ".repeat(depth);
    let item_children = list_node
        .children()
        .filter(|item| item.tag_name().name() == "list-item")
        .flat_map(|item| item.children());

    for entry in item_children {
        let marker = match entry.tag_name().name() {
            "p" => "- ",
            "h" => "- # ",
            "list" => {
                process_list_elements(entry, text_parts, depth + 1);
                continue;
            }
            _ => continue,
        };

        let Some(raw) = extract_node_text(entry) else {
            continue;
        };
        let line = raw.trim();
        if !line.is_empty() {
            text_parts.push(format!("{indent}{marker}{line}"));
        }
    }
}
fn extract_node_text(node: roxmltree::Node) -> Option<String> {
let mut text_parts = Vec::new();
for child in node.children() {
match child.tag_name().name() {
"span" => {
if let Some(text) = child.text() {
text_parts.push(text.to_string());
}
}
"tab" => {
text_parts.push("\t".to_string());
}
"line-break" => {
text_parts.push("\n".to_string());
}
_ => {
if let Some(text) = child.text() {
text_parts.push(text.to_string());
}
}
}
}
if text_parts.is_empty() {
node.text().map(|s| s.to_string())
} else {
Some(text_parts.join(""))
}
}
/// Renders a table element as a Markdown pipe table.
///
/// Rows are the `table-row` children of `table_node`; cells are their
/// `table-cell` children, with text taken from [`extract_node_text`] and
/// trimmed. Rows without cells are dropped, and shorter rows are padded
/// with empty cells to the widest row. The first row becomes the header,
/// followed by a `---` separator line and the remaining rows.
///
/// Returns `None` when the table has no non-empty rows.
fn extract_table_text(table_node: roxmltree::Node) -> Option<String> {
    /// Appends one `| a | b |` line (with trailing newline) to `out`.
    fn push_row(out: &mut String, cells: &[String]) {
        out.push('|');
        for cell in cells {
            out.push(' ');
            out.push_str(cell);
            out.push_str(" |");
        }
        out.push('\n');
    }

    let mut grid: Vec<Vec<String>> = table_node
        .children()
        .filter(|row| row.tag_name().name() == "table-row")
        .map(|row| {
            row.children()
                .filter(|cell| cell.tag_name().name() == "table-cell")
                .map(|cell| extract_node_text(cell).unwrap_or_default().trim().to_string())
                .collect::<Vec<String>>()
        })
        .filter(|row_cells| !row_cells.is_empty())
        .collect();

    // `max()` is `None` exactly when there are no rows.
    let width = grid.iter().map(Vec::len).max()?;
    for row in &mut grid {
        row.resize(width, String::new());
    }

    let (header, body) = grid.split_first()?;
    let mut markdown = String::new();
    push_row(&mut markdown, header);

    markdown.push('|');
    for _ in header {
        markdown.push_str(" --- |");
    }
    markdown.push('\n');

    for row in body {
        push_row(&mut markdown, row);
    }

    Some(markdown)
}
/// Builds a [`Table`] for every `table` element found in `content.xml`.
///
/// All descendants of the root element are scanned; each `table` element is
/// passed to [`parse_odt_table`] together with the number of tables accepted
/// so far. Tables for which the parser returns `None` are skipped and do not
/// consume an index.
///
/// # Errors
///
/// Returns a parsing error when `content.xml` exists but cannot be read or
/// is not well-formed XML. A missing `content.xml` yields an empty list.
fn extract_tables(archive: &mut zip::ZipArchive<Cursor<Vec<u8>>>) -> crate::error::Result<Vec<Table>> {
    let mut raw_xml = String::new();
    {
        use std::io::Read;
        let Ok(mut entry) = archive.by_name("content.xml") else {
            return Ok(Vec::new());
        };
        entry
            .read_to_string(&mut raw_xml)
            .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to read content.xml: {}", e)))?;
    }

    let parsed = Document::parse(&raw_xml)
        .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to parse content.xml: {}", e)))?;

    let table_nodes = parsed
        .root_element()
        .descendants()
        .filter(|candidate| candidate.tag_name().name() == "table");

    let mut tables: Vec<Table> = Vec::new();
    for table_node in table_nodes {
        // The running index is simply how many tables have been kept.
        if let Some(table) = parse_odt_table(table_node, tables.len()) {
            tables.push(table);
        }
    }

    Ok(tables)
}
/// Converts one table element into a [`Table`].
///
/// Cells are gathered from `table-cell` children of each `table-row` child,
/// using the trimmed result of [`extract_node_text`]. Rows without cells are
/// dropped. The Markdown form comes from `cells_to_markdown`, and
/// `page_number` is set to `table_index + 1`.
///
/// Returns `None` when no row contains any cell.
fn parse_odt_table(table_node: roxmltree::Node, table_index: usize) -> Option<Table> {
    let cells: Vec<Vec<String>> = table_node
        .children()
        .filter(|row| row.tag_name().name() == "table-row")
        .map(|row| {
            row.children()
                .filter(|cell| cell.tag_name().name() == "table-cell")
                .map(|cell| extract_node_text(cell).unwrap_or_default().trim().to_string())
                .collect::<Vec<String>>()
        })
        .filter(|row_cells| !row_cells.is_empty())
        .collect();

    if cells.is_empty() {
        return None;
    }

    let markdown = cells_to_markdown(&cells);
    Some(Table {
        cells,
        markdown,
        page_number: table_index + 1,
        bounding_box: None,
    })
}
#[async_trait]
impl DocumentExtractor for OdtExtractor {
#[cfg_attr(
feature = "otel",
tracing::instrument(
skip(self, content, _config),
fields(
extractor.name = self.name(),
content.size_bytes = content.len(),
)
)
)]
async fn extract_bytes(
&self,
content: &[u8],
mime_type: &str,
_config: &ExtractionConfig,
) -> Result<ExtractionResult> {
let content_owned = content.to_vec();
let (text, tables) = {
#[cfg(feature = "tokio-runtime")]
if crate::core::batch_mode::is_batch_mode() {
let content_for_task = content_owned.clone();
let span = tracing::Span::current();
tokio::task::spawn_blocking(move || -> crate::error::Result<(String, Vec<Table>)> {
let _guard = span.entered();
let cursor = Cursor::new(content_for_task);
let mut archive = zip::ZipArchive::new(cursor).map_err(|e| {
crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e))
})?;
let text = extract_content_text(&mut archive)?;
let tables = extract_tables(&mut archive)?;
let embedded_formulas = extract_embedded_formulas(&mut archive)?;
let combined_text = if !embedded_formulas.is_empty() {
if !text.is_empty() {
format!("{}\n{}", text, embedded_formulas)
} else {
embedded_formulas
}
} else {
text
};
Ok((combined_text, tables))
})
.await
.map_err(|e| crate::error::KreuzbergError::parsing(format!("ODT extraction task failed: {}", e)))??
} else {
let cursor = Cursor::new(content_owned.clone());
let mut archive = zip::ZipArchive::new(cursor)
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?;
let text = extract_content_text(&mut archive)?;
let tables = extract_tables(&mut archive)?;
let embedded_formulas = extract_embedded_formulas(&mut archive)?;
let combined_text = if !embedded_formulas.is_empty() {
if !text.is_empty() {
format!("{}\n{}", text, embedded_formulas)
} else {
embedded_formulas
}
} else {
text
};
(combined_text, tables)
}
#[cfg(not(feature = "tokio-runtime"))]
{
let cursor = Cursor::new(content_owned.clone());
let mut archive = zip::ZipArchive::new(cursor)
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?;
let text = extract_content_text(&mut archive)?;
let tables = extract_tables(&mut archive)?;
let embedded_formulas = extract_embedded_formulas(&mut archive)?;
let combined_text = if !embedded_formulas.is_empty() {
if !text.is_empty() {
format!("{}\n{}", text, embedded_formulas)
} else {
embedded_formulas
}
} else {
text
};
(combined_text, tables)
}
};
let mut metadata_map = AHashMap::new();
let cursor = Cursor::new(content_owned.clone());
let mut archive = zip::ZipArchive::new(cursor).map_err(|e| {
crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive for metadata: {}", e))
})?;
if let Ok(odt_props) = office_metadata::extract_odt_properties(&mut archive) {
if let Some(title) = odt_props.title {
metadata_map.insert(Cow::Borrowed("title"), serde_json::Value::String(title));
}
if let Some(creator) = odt_props.creator {
metadata_map.insert(
Cow::Borrowed("authors"),
serde_json::Value::Array(vec![serde_json::Value::String(creator.clone())]),
);
metadata_map.insert(Cow::Borrowed("created_by"), serde_json::Value::String(creator));
}
if let Some(initial_creator) = odt_props.initial_creator {
metadata_map.insert(
Cow::Borrowed("initial_creator"),
serde_json::Value::String(initial_creator),
);
}
if let Some(subject) = odt_props.subject {
metadata_map.insert(Cow::Borrowed("subject"), serde_json::Value::String(subject));
}
if let Some(keywords) = odt_props.keywords {
metadata_map.insert(Cow::Borrowed("keywords"), serde_json::Value::String(keywords));
}
if let Some(description) = odt_props.description {
metadata_map.insert(Cow::Borrowed("description"), serde_json::Value::String(description));
}
if let Some(creation_date) = odt_props.creation_date {
metadata_map.insert(Cow::Borrowed("created_at"), serde_json::Value::String(creation_date));
}
if let Some(date) = odt_props.date {
metadata_map.insert(Cow::Borrowed("modified_at"), serde_json::Value::String(date));
}
if let Some(language) = odt_props.language {
metadata_map.insert(Cow::Borrowed("language"), serde_json::Value::String(language));
}
if let Some(generator) = odt_props.generator {
metadata_map.insert(Cow::Borrowed("generator"), serde_json::Value::String(generator));
}
if let Some(editing_duration) = odt_props.editing_duration {
metadata_map.insert(
Cow::Borrowed("editing_duration"),
serde_json::Value::String(editing_duration),
);
}
if let Some(editing_cycles) = odt_props.editing_cycles {
metadata_map.insert(
Cow::Borrowed("editing_cycles"),
serde_json::Value::String(editing_cycles),
);
}
if let Some(page_count) = odt_props.page_count {
metadata_map.insert(
Cow::Borrowed("page_count"),
serde_json::Value::Number(page_count.into()),
);
}
if let Some(word_count) = odt_props.word_count {
metadata_map.insert(
Cow::Borrowed("word_count"),
serde_json::Value::Number(word_count.into()),
);
}
if let Some(character_count) = odt_props.character_count {
metadata_map.insert(
Cow::Borrowed("character_count"),
serde_json::Value::Number(character_count.into()),
);
}
if let Some(paragraph_count) = odt_props.paragraph_count {
metadata_map.insert(
Cow::Borrowed("paragraph_count"),
serde_json::Value::Number(paragraph_count.into()),
);
}
if let Some(table_count) = odt_props.table_count {
metadata_map.insert(
Cow::Borrowed("table_count"),
serde_json::Value::Number(table_count.into()),
);
}
if let Some(image_count) = odt_props.image_count {
metadata_map.insert(
Cow::Borrowed("image_count"),
serde_json::Value::Number(image_count.into()),
);
}
}
Ok(ExtractionResult {
content: text,
mime_type: mime_type.to_string().into(),
metadata: Metadata {
additional: metadata_map,
..Default::default()
},
pages: None,
tables,
detected_languages: None,
chunks: None,
images: None,
djot_content: None,
elements: None,
ocr_elements: None,
document: None,
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
extracted_keywords: None,
quality_score: None,
processing_warnings: Vec::new(),
annotations: None,
})
}
fn supported_mime_types(&self) -> &[&str] {
&["application/vnd.oasis.opendocument.text"]
}
fn priority(&self) -> i32 {
60
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // These checks are fully synchronous, so plain `#[test]` is used rather
    // than starting an async runtime for each one.

    /// The plugin reports its fixed name, crate version, priority and MIME list size.
    #[test]
    fn test_odt_extractor_plugin_interface() {
        let extractor = OdtExtractor::new();
        assert_eq!(extractor.name(), "odt-extractor");
        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
        assert_eq!(extractor.priority(), 60);
        assert_eq!(extractor.supported_mime_types().len(), 1);
    }

    /// The ODT MIME type is among the supported types.
    #[test]
    fn test_odt_extractor_supports_odt() {
        let extractor = OdtExtractor::new();
        assert!(
            extractor
                .supported_mime_types()
                .contains(&"application/vnd.oasis.opendocument.text")
        );
    }

    /// `Default::default()` yields a working extractor.
    #[test]
    fn test_odt_extractor_default() {
        let extractor = OdtExtractor::default();
        assert_eq!(extractor.name(), "odt-extractor");
    }

    /// `initialize` and `shutdown` both succeed.
    #[test]
    fn test_odt_extractor_initialize_shutdown() {
        let extractor = OdtExtractor::new();
        assert!(extractor.initialize().is_ok());
        assert!(extractor.shutdown().is_ok());
    }

    /// A paragraph holding only plain text is returned verbatim.
    #[test]
    fn test_extract_node_text_simple() {
        let xml = r#"<p xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0">Hello world</p>"#;
        let doc = roxmltree::Document::parse(xml).unwrap();
        let node = doc.root_element();
        let result = extract_node_text(node);
        assert_eq!(result.as_deref(), Some("Hello world"));
    }
}