#![allow(clippy::unused_self, clippy::uninlined_format_args)]
use std::path::{Path, PathBuf};
use std::time::Instant;
use async_trait::async_trait;
use super::traits::{ConverterMetadata, DocumentConverter};
use crate::Result;
use crate::types::{
ConversionOptions, ConversionOutput, ConversionResult, ConversionStatistics, DocumentMetadata,
FileFormat, OutputFormat, OutputMetadata,
};
/// Converter for DOCX (Word) documents.
///
/// This is a field-less unit struct: it carries no state of its own. The
/// conversion logic lives in its inherent methods and in its
/// `DocumentConverter` implementation.
#[derive(Debug)]
pub struct DocxConverter;
impl DocxConverter {
pub fn new() -> Self {
Self
}
/// Renders a DOCX file to one image per page (`DOCX → PDF → images`).
///
/// The work happens in a private scratch directory under the system temp
/// directory. The directory name combines the process id with a per-process
/// counter, so concurrent conversions inside one process never share files.
/// The directory is removed on success and on every error path.
///
/// * `path` – the DOCX file to convert.
/// * `format` – requested image format; WebP is rendered through the PNG
///   output of `pdftoppm`, exactly as before.
/// * `_quality`, `_options` – accepted for interface compatibility, unused.
/// * `dpi` – resolution passed to `pdftoppm -r`.
///
/// # Errors
///
/// Returns an engine error when LibreOffice or `pdftoppm` cannot be started
/// or exits unsuccessfully, or when LibreOffice produces no PDF. I/O errors
/// from the scratch directory are propagated.
#[cfg(feature = "pdf-to-image")]
async fn convert_to_images(
    &self,
    path: &Path,
    format: crate::types::ImageFormat,
    _quality: u8,
    dpi: u32,
    _options: &ConversionOptions,
) -> Result<Vec<ConversionOutput>> {
    use std::sync::atomic::{AtomicU64, Ordering};
    use tokio::fs;

    // Only uniqueness matters here, so relaxed ordering is sufficient.
    static NEXT_JOB: AtomicU64 = AtomicU64::new(0);

    eprintln!("🖼️ Converting DOCX to images (DOCX → PDF → Images)...");

    let job = NEXT_JOB.fetch_add(1, Ordering::Relaxed);
    let temp_dir = std::env::temp_dir().join(format!(
        "transmutation_docx_{}_{}",
        std::process::id(),
        job
    ));
    // A crashed earlier process with the same pid may have left files behind;
    // stale page images would otherwise be picked up as output of this run.
    let _ = fs::remove_dir_all(&temp_dir).await;
    fs::create_dir_all(&temp_dir).await?;

    // Single cleanup point: runs whether rendering succeeded or failed.
    let rendered = Self::render_docx_pages(&temp_dir, path, format, dpi).await;
    let _ = fs::remove_dir_all(&temp_dir).await;

    let outputs = rendered?;
    eprintln!("✅ DOCX → {} images complete!", outputs.len());
    Ok(outputs)
}

/// Runs LibreOffice and `pdftoppm` inside `work_dir` and loads the page images.
///
/// The caller owns `work_dir` and is responsible for deleting it afterwards.
#[cfg(feature = "pdf-to-image")]
async fn render_docx_pages(
    work_dir: &Path,
    path: &Path,
    format: crate::types::ImageFormat,
    dpi: u32,
) -> Result<Vec<ConversionOutput>> {
    use std::process::Command;
    use tokio::fs;

    // NOTE(review): `Command::output` blocks the executor thread while the
    // external tool runs; the original code did the same.
    eprintln!(" [1/2] DOCX → PDF (LibreOffice --headless)...");
    let (libreoffice_cmd, install_msg) = if cfg!(target_os = "windows") {
        (
            "soffice.exe",
            "Install LibreOffice from https://www.libreoffice.org/download/",
        )
    } else if cfg!(target_os = "macos") {
        (
            "/Applications/LibreOffice.app/Contents/MacOS/soffice",
            "Install: brew install libreoffice",
        )
    } else {
        ("libreoffice", "Install: sudo apt install libreoffice")
    };
    let conversion = Command::new(libreoffice_cmd)
        .arg("--headless")
        .arg("--convert-to")
        .arg("pdf")
        .arg("--outdir")
        .arg(work_dir)
        .arg(path)
        .output()
        .map_err(|e| {
            crate::TransmutationError::engine_error(
                "libreoffice",
                format!("Failed to run LibreOffice: {}.\n{}", e, install_msg),
            )
        })?;
    if !conversion.status.success() {
        let stderr = String::from_utf8_lossy(&conversion.stderr);
        return Err(crate::TransmutationError::engine_error(
            "libreoffice",
            format!("LibreOffice failed: {}", stderr),
        ));
    }

    // LibreOffice names the PDF after the input's file stem. Build the name as
    // an OsString so that non-UTF-8 file names still resolve to the right file.
    let mut pdf_name = path.file_stem().map_or_else(
        || std::ffi::OsString::from("document"),
        std::ffi::OsStr::to_os_string,
    );
    pdf_name.push(".pdf");
    let pdf_path = work_dir.join(pdf_name);
    if !pdf_path.exists() {
        return Err(crate::TransmutationError::engine_error(
            "libreoffice",
            "PDF not generated by LibreOffice".to_string(),
        ));
    }
    let pdf_size = pdf_path.metadata()?.len();
    eprintln!(" ✓ PDF: {} KB", pdf_size / 1024);

    eprintln!(" [2/2] PDF → Images (pdftoppm @ {} DPI)...", dpi);
    // `pdftoppm -jpeg` writes files ending in `.jpg`, so the command-line flag
    // and the extension to look for must be tracked separately.
    let (format_flag, output_ext) = match format {
        crate::types::ImageFormat::Png => ("png", "png"),
        crate::types::ImageFormat::Jpeg => ("jpeg", "jpg"),
        crate::types::ImageFormat::Webp => ("png", "png"),
    };
    let (pdftoppm_cmd, pdftoppm_install) = if cfg!(target_os = "windows") {
        ("pdftoppm.exe", "Install poppler: choco install poppler")
    } else if cfg!(target_os = "macos") {
        ("pdftoppm", "Install: brew install poppler")
    } else {
        ("pdftoppm", "Install: sudo apt install poppler-utils")
    };
    let render = Command::new(pdftoppm_cmd)
        .arg(format!("-{}", format_flag))
        .arg("-r")
        .arg(dpi.to_string())
        .arg(&pdf_path)
        .arg(work_dir.join("page"))
        .output()
        .map_err(|e| {
            crate::TransmutationError::engine_error(
                "pdftoppm",
                format!("Failed: {}.\n{}", e, pdftoppm_install),
            )
        })?;
    if !render.status.success() {
        let stderr = String::from_utf8_lossy(&render.stderr);
        return Err(crate::TransmutationError::engine_error(
            "pdftoppm",
            format!("pdftoppm failed: {}", stderr),
        ));
    }

    let mut entries = fs::read_dir(work_dir).await?;
    let mut image_files = Vec::new();
    while let Some(entry) = entries.next_entry().await? {
        let entry_path = entry.path();
        if entry_path.extension().and_then(|e| e.to_str()) == Some(output_ext) {
            image_files.push(entry_path);
        }
    }
    // Page order is recovered by sorting the generated file names.
    image_files.sort();
    eprintln!(" ✓ Rendered {} pages", image_files.len());

    let mut outputs = Vec::with_capacity(image_files.len());
    for (idx, image_path) in image_files.iter().enumerate() {
        let image_data = fs::read(image_path).await?;
        let size_bytes = image_data.len() as u64;
        outputs.push(ConversionOutput {
            page_number: idx + 1,
            data: image_data,
            metadata: OutputMetadata {
                size_bytes,
                chunk_count: 1,
                token_count: None,
            },
        });
    }
    Ok(outputs)
}
/// Converts a DOCX file to Markdown text using `docx-rs`.
///
/// Every top-level document child is turned into a text block; empty blocks
/// are dropped. When `options.split_pages` is set and there are more than 15
/// blocks, the blocks are grouped into "logical pages" of 15 and one output
/// is returned per group, numbered from 1. Otherwise a single output with
/// page number 0 is returned.
///
/// The token count of each output is estimated as its byte length divided
/// by four.
///
/// # Errors
///
/// Propagates the I/O error from reading `path`, and returns an engine error
/// when `docx-rs` cannot parse the file.
#[cfg(feature = "office")]
async fn convert_to_markdown(
    &self,
    path: &Path,
    options: &ConversionOptions,
) -> Result<Vec<ConversionOutput>> {
    /// Number of text blocks grouped into one logical page.
    const BLOCKS_PER_PAGE: usize = 15;

    /// Joins blocks with blank lines, collapses runs of three or more
    /// newlines down to two, and trims surrounding whitespace.
    fn join_blocks(blocks: &[String]) -> String {
        let mut text = blocks.join("\n\n");
        while text.contains("\n\n\n") {
            text = text.replace("\n\n\n", "\n\n");
        }
        text.trim().to_string()
    }

    /// Wraps finished Markdown text in a `ConversionOutput`.
    fn into_output(page_number: usize, text: String) -> ConversionOutput {
        let token_count = text.len() / 4;
        let data = text.into_bytes();
        let size_bytes = data.len() as u64;
        ConversionOutput {
            page_number,
            data,
            metadata: OutputMetadata {
                size_bytes,
                chunk_count: 1,
                token_count: Some(token_count),
            },
        }
    }

    eprintln!("📄 Reading DOCX file with docx-rs...");
    let bytes = tokio::fs::read(path).await?;
    let parsed = docx_rs::read_docx(&bytes).map_err(|e| {
        crate::TransmutationError::engine_error(
            "docx-rs",
            format!("Failed to parse DOCX: {:?}", e),
        )
    })?;
    eprintln!("✓ DOCX parsed successfully");

    let blocks: Vec<String> = parsed
        .document
        .children
        .iter()
        .map(|child| self.extract_text_from_child(child))
        .filter(|text| !text.is_empty())
        .collect();

    // Single-output path: splitting not requested, or too few blocks to split.
    if !options.split_pages || blocks.len() <= BLOCKS_PER_PAGE {
        let markdown = join_blocks(&blocks);
        eprintln!("✓ Converted to Markdown: {} chars", markdown.len());
        return Ok(vec![into_output(0, markdown)]);
    }

    eprintln!("📄 Splitting DOCX into logical pages (chunks)...");
    let pages: Vec<ConversionOutput> = blocks
        .chunks(BLOCKS_PER_PAGE)
        .enumerate()
        .map(|(index, group)| into_output(index + 1, join_blocks(group)))
        .collect();
    eprintln!("✓ Split into {} logical pages", pages.len());
    Ok(pages)
}
/// Returns the text of a single top-level document child.
///
/// Paragraphs and tables are delegated to their dedicated extractors; every
/// other kind of child yields an empty string.
#[cfg(feature = "office")]
fn extract_text_from_child(&self, child: &docx_rs::DocumentChild) -> String {
    match child {
        docx_rs::DocumentChild::Paragraph(paragraph) => self.extract_paragraph_text(paragraph),
        docx_rs::DocumentChild::Table(tbl) => self.extract_table_text(tbl),
        _ => String::new(),
    }
}
/// Concatenates the text of every run in a paragraph.
///
/// Only `Run` children of the paragraph are visited, and within each run
/// only `Text` children contribute. The result is trimmed of leading and
/// trailing whitespace.
#[cfg(feature = "office")]
fn extract_paragraph_text(&self, para: &docx_rs::Paragraph) -> String {
    use docx_rs::{ParagraphChild, RunChild};

    // Keep only the runs; other paragraph children carry no text we extract.
    let runs = para.children.iter().filter_map(|child| match child {
        ParagraphChild::Run(run) => Some(run),
        _ => None,
    });

    let mut text = String::new();
    for run in runs {
        for run_child in &run.children {
            if let RunChild::Text(t) = run_child {
                text.push_str(&t.text);
            }
        }
    }
    text.trim().to_string()
}
/// Returns the text representation of a table.
///
/// The table itself is not inspected: a fixed placeholder string is
/// returned for every table.
#[cfg(feature = "office")]
fn extract_table_text(&self, _table: &docx_rs::Table) -> String {
    "[Table content]".to_owned()
}
}
impl Default for DocxConverter {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl DocumentConverter for DocxConverter {
/// Input formats handled by this converter: DOCX only.
fn supported_formats(&self) -> Vec<FileFormat> {
    Vec::from([FileFormat::Docx])
}
/// Output formats advertised by this converter.
///
/// Only Markdown (unsplit, LLM-optimised) is listed. `convert` additionally
/// accepts image output when the `pdf-to-image` feature is enabled, but that
/// format is not advertised here.
fn output_formats(&self) -> Vec<OutputFormat> {
    let markdown = OutputFormat::Markdown {
        split_pages: false,
        optimize_for_llm: true,
    };
    vec![markdown]
}
/// Converts the DOCX file at `input` into `output_format`.
///
/// * Markdown output requires the `office` feature.
/// * Image output requires the `pdf-to-image` feature.
/// * Any other output format is rejected.
///
/// The returned metadata and statistics report the number of produced
/// outputs (Markdown chunks or rendered page images) as the page count,
/// with a minimum of 1.
///
/// # Errors
///
/// Returns `InvalidOptions` when the requested format is unsupported or its
/// feature is disabled. Errors from reading `input` and from the underlying
/// conversion are propagated.
async fn convert(
    &self,
    input: &Path,
    output_format: OutputFormat,
    options: ConversionOptions,
) -> Result<ConversionResult> {
    let start_time = Instant::now();
    let input_size = tokio::fs::metadata(input).await?.len();
    let content = match output_format {
        OutputFormat::Markdown { .. } => {
            #[cfg(feature = "office")]
            {
                self.convert_to_markdown(input, &options).await?
            }
            #[cfg(not(feature = "office"))]
            {
                return Err(crate::TransmutationError::InvalidOptions(
                    "DOCX conversion requires office feature".to_string(),
                ));
            }
        }
        // Underscore-prefixed bindings stay unused when the feature is off.
        OutputFormat::Image {
            format: _format,
            quality: _quality,
            dpi: _dpi,
        } => {
            #[cfg(feature = "pdf-to-image")]
            {
                self.convert_to_images(input, _format, _quality, _dpi, &options)
                    .await?
            }
            #[cfg(not(feature = "pdf-to-image"))]
            {
                return Err(crate::TransmutationError::InvalidOptions(
                    "DOCX to image requires pdf-to-image feature (uses LibreOffice + pdftoppm)"
                        .to_string(),
                ));
            }
        }
        _ => {
            return Err(crate::TransmutationError::InvalidOptions(format!(
                "Unsupported output format for DOCX: {:?}",
                output_format
            )));
        }
    };
    let output_size: u64 = content.iter().map(|c| c.metadata.size_bytes).sum();
    // One output is produced per page/chunk. Never report fewer than one page,
    // which keeps the previous value for single-output conversions.
    // NOTE(review): assumes `page_count` and `pages_processed` are `usize`,
    // like `page_number` — confirm against `crate::types`.
    let page_total = content.len().max(1);
    let metadata = DocumentMetadata {
        title: None,
        author: None,
        created: None,
        modified: None,
        page_count: page_total,
        language: None,
        custom: std::collections::HashMap::new(),
    };
    let duration = start_time.elapsed();
    let statistics = ConversionStatistics {
        input_size_bytes: input_size,
        output_size_bytes: output_size,
        duration,
        pages_processed: page_total,
        tables_extracted: 0,
        images_extracted: 0,
        cache_hit: false,
    };
    Ok(ConversionResult {
        input_path: PathBuf::from(input),
        input_format: FileFormat::Docx,
        output_format,
        content,
        metadata,
        statistics,
    })
}
/// Static description of this converter: display name, crate version
/// (taken from `CARGO_PKG_VERSION` at compile time), a short description
/// and the list of external dependencies.
fn metadata(&self) -> ConverterMetadata {
    let name = String::from("DOCX Converter");
    let version = String::from(env!("CARGO_PKG_VERSION"));
    let description = String::from("Pure Rust DOCX to Markdown converter");
    let external_deps = vec![String::from("docx-rs")];
    ConverterMetadata {
        name,
        version,
        description,
        external_deps,
    }
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly built converter reports DOCX as its only input format.
    #[test]
    fn test_docx_converter_creation() {
        let formats = DocxConverter::new().supported_formats();
        assert_eq!(formats, vec![FileFormat::Docx]);
    }

    /// The converter metadata carries the expected name and lists `docx-rs`.
    #[test]
    fn test_docx_converter_metadata() {
        let info = DocxConverter::new().metadata();
        assert_eq!(info.name, "DOCX Converter");
        assert!(info.external_deps.iter().any(|dep| dep == "docx-rs"));
    }
}