pub mod adapters;
pub mod converter;
pub mod core;
pub mod error;
pub mod localization;
pub mod render;
pub use converter::DocxToMarkdown;
pub use error::{Error, Result};
pub use localization::parse_heading_style;
/// Short alias for [`DocxToMarkdown`], the converter type re-exported from `converter`.
pub type Converter = DocxToMarkdown;
use std::path::{Path, PathBuf};
/// Options handed to the converter.
///
/// The [`Default`] value is: inline images, whitespace preservation off,
/// HTML underline on, HTML strikethrough off, strict reference validation off.
///
/// Implements `PartialEq`/`Eq` so option sets can be compared directly
/// (for example in tests or when caching converters by configuration).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ConvertOptions {
    /// What to do with images; see [`ImageHandling`].
    pub image_handling: ImageHandling,
    /// Whitespace-preservation flag (off by default).
    pub preserve_whitespace: bool,
    /// HTML-underline flag (on by default).
    // NOTE(review): presumably controls whether underline is emitted as HTML markup — confirm in the renderer.
    pub html_underline: bool,
    /// HTML-strikethrough flag (off by default).
    // NOTE(review): presumably controls whether strikethrough is emitted as HTML markup — confirm in the renderer.
    pub html_strikethrough: bool,
    /// Strict reference validation flag (off by default).
    pub strict_reference_validation: bool,
}

impl Default for ConvertOptions {
    /// Returns the default option set described on [`ConvertOptions`].
    fn default() -> Self {
        Self {
            image_handling: ImageHandling::Inline,
            preserve_whitespace: false,
            // Underline is the only flag that is enabled by default, which is
            // why this impl is written by hand instead of derived.
            html_underline: true,
            html_strikethrough: false,
            strict_reference_validation: false,
        }
    }
}

/// Strategy for images encountered during conversion.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ImageHandling {
    /// Carries a target directory; selected by `Builder::save_images_to`.
    SaveToDir(PathBuf),
    /// The default strategy; selected by `Builder::inline_images`.
    Inline,
    /// Selected by `Builder::skip_images`.
    Skip,
}
/// Fluent builder for a [`Converter`].
///
/// A builder starts from `ConvertOptions::default()`. Every setter consumes the
/// builder and returns the updated one, so calls can be chained and finished
/// with [`Builder::build`] or one of the `convert*` shortcuts.
///
/// `Default` is derived: the only field is `ConvertOptions`, so the derived
/// value is identical to what [`Builder::new`] returns.
#[derive(Debug, Clone, Default)]
#[must_use = "a builder has no effect until `build` or a `convert*` method is called"]
pub struct Builder {
    options: ConvertOptions,
}

impl Builder {
    /// Creates a builder holding the default `ConvertOptions`.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets image handling to `ImageHandling::Skip`.
    pub fn skip_images(mut self) -> Self {
        self.options.image_handling = ImageHandling::Skip;
        self
    }

    /// Sets image handling to `ImageHandling::Inline`.
    pub fn inline_images(mut self) -> Self {
        self.options.image_handling = ImageHandling::Inline;
        self
    }

    /// Sets image handling to `ImageHandling::SaveToDir` with the given directory.
    pub fn save_images_to(mut self, dir: impl Into<PathBuf>) -> Self {
        self.options.image_handling = ImageHandling::SaveToDir(dir.into());
        self
    }

    /// Turns the `preserve_whitespace` option on.
    pub fn preserve_whitespace(mut self) -> Self {
        self.options.preserve_whitespace = true;
        self
    }

    /// Sets the `html_underline` option to `enabled`.
    pub fn html_underline(mut self, enabled: bool) -> Self {
        self.options.html_underline = enabled;
        self
    }

    /// Sets the `html_strikethrough` option to `enabled`.
    pub fn html_strikethrough(mut self, enabled: bool) -> Self {
        self.options.html_strikethrough = enabled;
        self
    }

    /// Turns the `strict_reference_validation` option on.
    pub fn strict(mut self) -> Self {
        self.options.strict_reference_validation = true;
        self
    }

    /// Consumes the builder and constructs a [`Converter`] from the collected options.
    pub fn build(self) -> Converter {
        Converter::new(self.options)
    }

    /// Builds the converter and immediately converts the file at `path`.
    ///
    /// # Errors
    ///
    /// Forwards any error returned by the converter's `convert` method.
    pub fn convert(self, path: impl AsRef<Path>) -> Result<String> {
        self.build().convert(path)
    }

    /// Builds the converter and immediately converts an in-memory document.
    ///
    /// # Errors
    ///
    /// Forwards any error returned by the converter's `convert_bytes` method.
    pub fn convert_bytes(self, bytes: &[u8]) -> Result<String> {
        self.build().convert_bytes(bytes)
    }

    /// Builds the converter and immediately converts a document read from `reader`.
    ///
    /// # Errors
    ///
    /// Forwards any error returned by the converter's `convert_reader` method.
    pub fn convert_reader(self, reader: impl std::io::Read + std::io::Seek) -> Result<String> {
        self.build().convert_reader(reader)
    }
}
/// Converts the document at `path` with a converter obtained from
/// `Converter::with_defaults()` and returns the resulting text.
///
/// # Errors
///
/// Forwards any error reported by the converter.
pub fn convert(path: impl AsRef<Path>) -> Result<String> {
    let converter = Converter::with_defaults();
    converter.convert(path)
}
/// Converts an in-memory document with a converter obtained from
/// `Converter::with_defaults()` and returns the resulting text.
///
/// # Errors
///
/// Forwards any error reported by the converter.
pub fn convert_bytes(bytes: &[u8]) -> Result<String> {
    let converter = Converter::with_defaults();
    converter.convert_bytes(bytes)
}
pub fn convert_reader(reader: impl std::io::Read + std::io::Seek) -> Result<String> {
Converter::with_defaults().convert_reader(reader)
}
/// Returns a fresh [`Builder`] for configuring a conversion.
pub fn builder() -> Builder {
    // `Builder`'s `Default` value is the same as `Builder::new()`.
    Builder::default()
}
#[cfg(feature = "python")]
mod python_bindings {
    use super::*;
    use pyo3::prelude::*;
    use pyo3::types::PyBytes;

    /// Wraps a conversion error in a Python `RuntimeError` carrying its display text.
    fn to_py_err(err: impl std::fmt::Display) -> PyErr {
        PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(err.to_string())
    }

    /// Convert a DOCX document to text.
    ///
    /// `input` is either a filesystem path (`str` or any `os.PathLike`, e.g.
    /// `pathlib.Path`) or a `bytes` object holding the document itself.
    ///
    /// Keyword-only arguments override the corresponding `ConvertOptions`
    /// field when given; omitted arguments keep the default. `image_handling`
    /// accepts `"inline"` or `"skip"`; any other string is used as the
    /// directory for `ImageHandling::SaveToDir`.
    ///
    /// Raises `RuntimeError` if the conversion fails and `TypeError` if
    /// `input` is neither a path nor `bytes`.
    #[pyfunction]
    #[pyo3(signature = (input, *, image_handling=None, preserve_whitespace=None, html_underline=None, html_strikethrough=None, strict_reference_validation=None))]
    fn convert_docx(
        input: &Bound<'_, PyAny>,
        image_handling: Option<String>,
        preserve_whitespace: Option<bool>,
        html_underline: Option<bool>,
        html_strikethrough: Option<bool>,
        strict_reference_validation: Option<bool>,
    ) -> PyResult<String> {
        let mut options = ConvertOptions::default();
        if let Some(handling) = image_handling {
            options.image_handling = match handling.as_str() {
                "inline" => ImageHandling::Inline,
                "skip" => ImageHandling::Skip,
                // Every other value is interpreted as an output directory.
                path => ImageHandling::SaveToDir(PathBuf::from(path)),
            };
        }
        if let Some(v) = preserve_whitespace {
            options.preserve_whitespace = v;
        }
        if let Some(v) = html_underline {
            options.html_underline = v;
        }
        if let Some(v) = html_strikethrough {
            options.html_strikethrough = v;
        }
        if let Some(v) = strict_reference_validation {
            options.strict_reference_validation = v;
        }

        let converter = DocxToMarkdown::new(options);

        // Extracting `PathBuf` (rather than `String`) goes through `os.fspath`,
        // so `pathlib.Path` and other `os.PathLike` objects are accepted in
        // addition to `str`. A `bytes` argument does not extract as a path and
        // therefore still reaches the in-memory branch below.
        if let Ok(path) = input.extract::<PathBuf>() {
            converter.convert(&path).map_err(to_py_err)
        } else if let Ok(bytes) = input.downcast::<PyBytes>() {
            converter.convert_bytes(bytes.as_bytes()).map_err(to_py_err)
        } else {
            Err(PyErr::new::<pyo3::exceptions::PyTypeError, _>(
                "Expected string path, os.PathLike, or bytes",
            ))
        }
    }

    /// Python module definition: registers `convert_docx`.
    #[pymodule]
    pub fn undocx(m: &Bound<'_, PyModule>) -> PyResult<()> {
        m.add_function(wrap_pyfunction!(convert_docx, m)?)?;
        Ok(())
    }
}