#![warn(missing_docs, missing_debug_implementations, rust_2024_compatibility)]
#![deny(unsafe_code)]
#![cfg_attr(docsrs, feature(doc_cfg))]
pub mod batch;
pub mod converters;
#[cfg(feature = "docling-ffi")]
#[doc = "Document types and processing (docling-core compatible)"]
pub mod document;
pub mod engines;
pub mod error;
pub mod integration;
#[cfg(feature = "docling-ffi")]
pub mod ml;
pub mod optimization;
pub mod output;
pub mod pipeline; pub mod types;
pub mod utils;
pub use batch::{BatchProcessor, BatchResult};
pub use converters::{ConverterMetadata, DocumentConverter};
pub use error::{Result, TransmutationError};
pub use types::*;
/// Version string of this crate, captured at compile time from the
/// `CARGO_PKG_VERSION` environment variable that Cargo sets during the build.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
/// Entry point for conversions.
///
/// Owns a [`ConverterConfig`] and hands out a [`ConversionBuilder`] for each
/// input file passed to `Converter::convert`.
#[derive(Debug)]
pub struct Converter {
/// Settings supplied at construction; readable through `Converter::config`.
config: ConverterConfig,
}
/// Settings carried by a [`Converter`].
///
/// The `Default` implementation turns caching on, sets `max_parallel` to the
/// value returned by `num_cpus::get()`, and uses a 300-second timeout.
///
/// NOTE(review): nothing visible in this file reads these fields after
/// construction — confirm where they take effect before relying on them.
#[derive(Debug, Clone)]
pub struct ConverterConfig {
/// Caching switch; `true` by default.
/// Presumably controls caching of conversion results — TODO confirm.
pub enable_cache: bool,
/// Parallelism limit, judging by the name; defaults to `num_cpus::get()`.
/// NOTE(review): the unit (threads, tasks, files) is not visible here.
pub max_parallel: usize,
/// Timeout duration; defaults to 300 seconds.
/// NOTE(review): what the timeout applies to is not visible here — confirm.
pub timeout: std::time::Duration,
}
impl Default for ConverterConfig {
    /// Returns the stock configuration: caching enabled, `max_parallel` taken
    /// from `num_cpus::get()`, and a timeout of 300 seconds.
    fn default() -> Self {
        let max_parallel = num_cpus::get();
        let timeout = std::time::Duration::from_secs(300);

        Self {
            enable_cache: true,
            max_parallel,
            timeout,
        }
    }
}
impl Converter {
    /// Creates a converter that uses `ConverterConfig::default()`.
    ///
    /// # Errors
    ///
    /// Returns whatever `Converter::with_config` returns.
    pub fn new() -> Result<Self> {
        let defaults = ConverterConfig::default();
        Self::with_config(defaults)
    }

    /// Creates a converter around the supplied `config`.
    ///
    /// Emits an `info`-level `tracing` event that includes `VERSION`.
    ///
    /// # Errors
    ///
    /// The current body never produces an error; it always returns `Ok`.
    pub fn with_config(config: ConverterConfig) -> Result<Self> {
        tracing::info!("Initializing Transmutation v{}", VERSION);
        let converter = Self { config };
        Ok(converter)
    }

    /// Borrows the configuration this converter was built with.
    pub fn config(&self) -> &ConverterConfig {
        &self.config
    }

    /// Starts a conversion of the file at `input`.
    ///
    /// The path is copied into an owned `PathBuf` and wrapped in a fresh
    /// `ConversionBuilder`; nothing is read or converted until the builder's
    /// `execute` method is awaited. The converter's own configuration is not
    /// passed on to the builder.
    pub fn convert<P>(&self, input: P) -> ConversionBuilder
    where
        P: AsRef<std::path::Path>,
    {
        let path = input.as_ref().to_path_buf();
        ConversionBuilder::new(path)
    }
}
impl Default for Converter {
fn default() -> Self {
Self::new().expect("Failed to create default converter")
}
}
/// Builder describing a single conversion: which file to read, which output
/// format to produce, and which options to hand to the converter.
///
/// Nothing happens until `execute` is awaited.
#[derive(Debug)]
pub struct ConversionBuilder {
/// Path of the file to convert.
input: std::path::PathBuf,
/// Requested output format; when `None`, `execute` falls back to Markdown
/// with `split_pages: false` and `optimize_for_llm: true`.
output_format: Option<OutputFormat>,
/// Options forwarded unchanged to whichever converter handles the input.
options: ConversionOptions,
}
impl ConversionBuilder {
pub fn new(input: std::path::PathBuf) -> Self {
Self {
input,
output_format: None,
options: ConversionOptions::default(),
}
}
pub fn to(mut self, format: OutputFormat) -> Self {
self.output_format = Some(format);
self
}
pub fn with_options(mut self, options: ConversionOptions) -> Self {
self.options = options;
self
}
pub async fn execute(self) -> Result<ConversionResult> {
use crate::utils::detect_format;
let input_format = detect_format(&self.input).await?;
let output_format = self.output_format.unwrap_or(OutputFormat::Markdown {
split_pages: false,
optimize_for_llm: true,
});
if input_format == FileFormat::Pdf {
use crate::converters::pdf::PdfConverter;
let converter = PdfConverter::new();
return converter
.convert(&self.input, output_format, self.options)
.await;
}
if input_format == FileFormat::Html {
use crate::converters::html::HtmlConverter;
let converter = HtmlConverter::new();
return converter
.convert(&self.input, output_format, self.options)
.await;
}
if input_format == FileFormat::Xml {
use crate::converters::xml::XmlConverter;
let converter = XmlConverter::new();
return converter
.convert(&self.input, output_format, self.options)
.await;
}
if input_format == FileFormat::Zip
|| input_format == FileFormat::Tar
|| input_format == FileFormat::TarGz
|| input_format == FileFormat::TarBz2
|| input_format == FileFormat::SevenZ
{
use crate::converters::archive::ArchiveConverter;
let converter = ArchiveConverter::new();
return converter
.convert(&self.input, output_format, self.options)
.await;
}
#[cfg(feature = "office")]
if input_format == FileFormat::Docx {
use crate::converters::docx::DocxConverter;
let converter = DocxConverter::new();
return converter
.convert(&self.input, output_format, self.options)
.await;
}
#[cfg(feature = "office")]
if input_format == FileFormat::Xlsx {
use crate::converters::xlsx::XlsxConverter;
let converter = XlsxConverter::new();
return converter
.convert(&self.input, output_format, self.options)
.await;
}
#[cfg(feature = "office")]
if input_format == FileFormat::Pptx {
use crate::converters::pptx::PptxConverter;
let converter = PptxConverter::new();
return converter
.convert(&self.input, output_format, self.options)
.await;
}
if input_format == FileFormat::Txt {
use crate::converters::txt::TxtConverter;
let converter = TxtConverter::new();
return converter
.convert(&self.input, output_format, self.options)
.await;
}
if input_format == FileFormat::Csv {
use crate::converters::csv::CsvConverter;
let converter = CsvConverter::new();
return converter
.convert(&self.input, output_format, self.options)
.await;
}
if input_format == FileFormat::Tsv {
use crate::converters::csv::CsvConverter;
let converter = CsvConverter::new_tsv();
return converter
.convert(&self.input, output_format, self.options)
.await;
}
if input_format == FileFormat::Rtf {
use crate::converters::rtf::RtfConverter;
let converter = RtfConverter::new();
return converter
.convert(&self.input, output_format, self.options)
.await;
}
if input_format == FileFormat::Odt {
use crate::converters::odt::OdtConverter;
let converter = OdtConverter::new();
return converter
.convert(&self.input, output_format, self.options)
.await;
}
#[cfg(feature = "image-ocr")]
if input_format.is_image() {
use crate::converters::image::ImageConverter;
let converter = ImageConverter::new();
return converter
.convert(&self.input, output_format, self.options)
.await;
}
#[cfg(feature = "audio")]
if input_format.is_audio() {
use crate::converters::audio::AudioConverter;
let converter = AudioConverter::new();
return converter
.convert(&self.input, output_format, self.options)
.await;
}
#[cfg(feature = "video")]
if input_format.is_video() {
use crate::converters::video::VideoConverter;
let converter = VideoConverter::new();
return converter
.convert(&self.input, output_format, self.options)
.await;
}
Err(TransmutationError::UnsupportedFormat(format!(
"Format {input_format:?} is not supported or feature not enabled"
)))
}
}
// Unit tests for converter construction and the default configuration.
#[cfg(test)]
mod tests {
use super::*;
// Smoke test: `Converter::new` must return `Ok`.
#[test]
fn test_converter_creation() {
let converter = Converter::new();
assert!(converter.is_ok());
}
// The default configuration enables caching and reports a non-zero
// `max_parallel`.
#[test]
fn test_default_config() {
let config = ConverterConfig::default();
assert!(config.enable_cache);
assert!(config.max_parallel > 0);
}
}