//! Converts documents to Markdown.
//!
//! `convert_bytes` dispatches on a file extension to the per-format converters
//! in `converter` (DOCX, PPTX, XLSX, Jupyter notebooks, JSON, XML, CSV, HTML,
//! images, source code and plain text). On non-wasm32 targets `convert_file`
//! additionally reads the file from disk and detects its format. Async
//! counterparts of both are available behind the `async` feature.
#![warn(missing_docs)]
/// The `Converter` trait, conversion options/results/warnings, and the
/// per-format converter implementations.
pub mod converter;
/// Input format detection (`detect_format`, `detect_zip_format`).
pub mod detection;
/// The crate's error type, `ConvertError`.
pub mod error;
// NOTE(review): this module's contents are not visible from this file;
// confirm and expand the one-line summary below.
/// Markdown-related helpers.
pub mod markdown;
pub(crate) mod zip_utils;
// Async option/describer types are only re-exported with the `async` feature.
#[cfg(feature = "async")]
pub use converter::{AsyncConversionOptions, AsyncImageDescriber};
pub use converter::{
ConversionOptions, ConversionResult, ConversionWarning, Converter, ImageDescriber, WarningCode,
};
pub use error::ConvertError;
/// Re-exports of the Gemini describer types defined in `converter::gemini`.
pub mod gemini {
// `GeminiDescriber` is not exported on wasm32 targets.
#[cfg(not(target_arch = "wasm32"))]
pub use crate::converter::gemini::GeminiDescriber;
// `AsyncGeminiDescriber` is only exported when the `async-gemini` feature is enabled.
#[cfg(feature = "async-gemini")]
pub use crate::converter::gemini::AsyncGeminiDescriber;
}
#[cfg(feature = "wasm")]
mod wasm;
#[cfg(not(target_arch = "wasm32"))]
use std::path::Path;
#[cfg(not(target_arch = "wasm32"))]
pub fn convert_file(
path: impl AsRef<Path>,
options: &ConversionOptions,
) -> Result<ConversionResult, ConvertError> {
let path = path.as_ref();
let size = std::fs::metadata(path)?.len() as usize;
if size > options.max_input_bytes {
return Err(ConvertError::InputTooLarge {
size,
limit: options.max_input_bytes,
});
}
let data = std::fs::read(path)?;
if data.len() > options.max_input_bytes {
return Err(ConvertError::InputTooLarge {
size: data.len(),
limit: options.max_input_bytes,
});
}
let format = detection::detect_format(path, &data);
let (format, is_zip_magic) = match format {
Some("zip") => (detection::detect_zip_format(&data), true),
other => (other, false),
};
let extension = match format {
Some("code") => path.extension().and_then(|e| e.to_str()).unwrap_or("code"),
Some(fmt) => fmt,
None if is_zip_magic => {
return Err(ConvertError::UnsupportedFormat {
extension: "zip".to_string(),
});
}
None => path.extension().and_then(|e| e.to_str()).unwrap_or(""),
};
convert_bytes(&data, extension, options)
}
/// Converts in-memory document bytes to Markdown, choosing the converter from
/// `extension`.
///
/// `extension` is normalised first (surrounding whitespace and leading dots
/// removed, ASCII lower-cased), so `"txt"`, `".txt"` and `" TXT "` are
/// equivalent. The code converter is consulted first; after that the remaining
/// converters are tried in a fixed order and the first one whose `can_convert`
/// accepts the input is used.
///
/// # Errors
///
/// - `ConvertError::InputTooLarge` when `data` is longer than
///   `options.max_input_bytes`.
/// - `ConvertError::FormatNotSupported` for `pdf`.
/// - `ConvertError::UnsupportedFormat` when no converter accepts the input.
/// - `ConvertError::MalformedDocument` when `options.strict` is set and the
///   conversion produced at least one warning.
/// - Any error returned by the selected converter.
pub fn convert_bytes(
    data: &[u8],
    extension: &str,
    options: &ConversionOptions,
) -> Result<ConversionResult, ConvertError> {
    use converter::{
        code::CodeConverter, csv::CsvConverter, docx::DocxConverter, html::HtmlConverter,
        image::ImageConverter, ipynb::IpynbConverter, json::JsonConverter,
        plain_text::PlainTextConverter, pptx::PptxConverter, xlsx::XlsxConverter,
        xml::XmlConverter,
    };

    let ext = normalize_extension(extension);

    let input_len = data.len();
    if input_len > options.max_input_bytes {
        return Err(ConvertError::InputTooLarge {
            size: input_len,
            limit: options.max_input_bytes,
        });
    }

    if ext == "pdf" {
        return Err(ConvertError::FormatNotSupported {
            extension: "pdf".to_string(),
            reason: "PDF is intentionally unsupported — Gemini, ChatGPT, and Claude \
                     handle PDF natively"
                .to_string(),
        });
    }

    // The code converter is handled separately because it is the only one that
    // is given the extension in addition to the bytes.
    if CodeConverter.can_convert(&ext, data) {
        let converted = CodeConverter.convert_with_extension(data, &ext, options)?;
        return enforce_strict_mode(converted, options.strict);
    }

    // Order matters: the first converter that accepts the input wins.
    let candidates: [&dyn Converter; 10] = [
        &DocxConverter,
        &PptxConverter,
        &XlsxConverter,
        &IpynbConverter,
        &JsonConverter,
        &XmlConverter,
        &CsvConverter,
        &HtmlConverter,
        &ImageConverter,
        &PlainTextConverter,
    ];

    match candidates.iter().find(|c| c.can_convert(&ext, data)) {
        Some(chosen) => {
            let converted = chosen.convert(data, options)?;
            enforce_strict_mode(converted, options.strict)
        }
        None => Err(ConvertError::UnsupportedFormat { extension: ext }),
    }
}
/// Applies strict-mode semantics to a finished conversion.
///
/// When `strict` is `false`, or the result carries no warnings, the result is
/// returned unchanged. Otherwise the first warning is turned into a
/// `ConvertError::MalformedDocument` whose reason contains the warning's code,
/// message and, when present, its location in parentheses.
fn enforce_strict_mode(
    result: ConversionResult,
    strict: bool,
) -> Result<ConversionResult, ConvertError> {
    if !strict {
        return Ok(result);
    }
    let Some(warning) = result.warnings.first() else {
        return Ok(result);
    };

    let location_suffix = match warning.location.as_deref() {
        Some(l) => format!(" ({l})"),
        None => String::new(),
    };
    let reason = format!(
        "strict mode: encountered warning [{:?}] {}{}",
        warning.code, warning.message, location_suffix
    );
    Err(ConvertError::MalformedDocument { reason })
}
/// Canonicalises a user-supplied extension for converter lookup.
///
/// Surrounding whitespace is trimmed, every leading `.` is dropped, and ASCII
/// letters are lower-cased; non-ASCII characters are left as they are.
fn normalize_extension(extension: &str) -> String {
    extension
        .trim()
        .chars()
        .skip_while(|&c| c == '.')
        .map(|c| c.to_ascii_lowercase())
        .collect()
}
/// Async counterpart of `convert_file`: reads the file at `path`, determines
/// its format, and converts it with [`convert_bytes_async`].
///
/// Format resolution is the same as in `convert_file`: `detection::detect_format`
/// first, `detection::detect_zip_format` for ZIP containers, and the path's own
/// extension when detection yields nothing (or yields `"code"`).
///
/// Note that the file itself is still read with blocking `std::fs` calls; only
/// the conversion step is awaited.
///
/// # Errors
///
/// - I/O errors from reading the file's metadata or contents.
/// - `ConvertError::InputTooLarge` when the file is larger than
///   `options.base.max_input_bytes`.
/// - `ConvertError::UnsupportedFormat` (extension `"zip"`) when the data is a
///   ZIP container that `detect_zip_format` cannot identify.
/// - Any error returned by [`convert_bytes_async`].
#[cfg(feature = "async")]
#[cfg(not(target_arch = "wasm32"))]
pub async fn convert_file_async(
    path: impl AsRef<Path>,
    options: &converter::AsyncConversionOptions,
) -> Result<ConversionResult, ConvertError> {
    let path = path.as_ref();
    let limit = options.base.max_input_bytes;

    // Reject oversized files from metadata alone, before loading them into memory.
    // `Metadata::len` is a `u64`: saturate rather than cast with `as`, which would
    // wrap on 32-bit targets and let a multi-gigabyte file slip past this guard.
    let size = usize::try_from(std::fs::metadata(path)?.len()).unwrap_or(usize::MAX);
    if size > limit {
        return Err(ConvertError::InputTooLarge { size, limit });
    }

    let data = std::fs::read(path)?;
    // Re-check what was actually read: the file may have changed since the
    // metadata call above.
    if data.len() > limit {
        return Err(ConvertError::InputTooLarge {
            size: data.len(),
            limit,
        });
    }

    // A ZIP container needs a second detection pass to identify the format inside.
    let (format, is_zip_magic) = match detection::detect_format(path, &data) {
        Some("zip") => (detection::detect_zip_format(&data), true),
        other => (other, false),
    };

    let path_extension = path.extension().and_then(|e| e.to_str());
    let extension = match format {
        // "code" is a generic label; the concrete extension is passed on instead.
        Some("code") => path_extension.unwrap_or("code"),
        Some(fmt) => fmt,
        // ZIP magic bytes but no recognised format inside: do not fall back to
        // the file name's extension, report the archive as unsupported.
        None if is_zip_magic => {
            return Err(ConvertError::UnsupportedFormat {
                extension: "zip".to_string(),
            });
        }
        None => path_extension.unwrap_or(""),
    };

    convert_bytes_async(&data, extension, options).await
}
/// Async variant of `convert_bytes` that can describe embedded images through
/// `options.async_image_describer`.
///
/// When a describer is configured and the normalised extension is `docx`,
/// `pptx`, `xlsx`/`xls`, or one the image converter accepts, the matching
/// converter's `convert_inner` is run and any pending image placeholders are
/// resolved by awaiting the describer. In every other case the call is
/// forwarded to the synchronous `convert_bytes` with `options.base`.
///
/// # Errors
///
/// - `ConvertError::InputTooLarge` when `data` is longer than
///   `options.base.max_input_bytes`.
/// - `ConvertError::FormatNotSupported` for `pdf`.
/// - `ConvertError::MalformedDocument` when `options.base.strict` is set and
///   the conversion produced at least one warning.
/// - Any error returned by the selected converter or by `convert_bytes`.
#[cfg(feature = "async")]
pub async fn convert_bytes_async(
    data: &[u8],
    extension: &str,
    options: &converter::AsyncConversionOptions,
) -> Result<ConversionResult, ConvertError> {
    let base = &options.base;
    let ext = normalize_extension(extension);

    let input_len = data.len();
    if input_len > base.max_input_bytes {
        return Err(ConvertError::InputTooLarge {
            size: input_len,
            limit: base.max_input_bytes,
        });
    }

    if ext == "pdf" {
        return Err(ConvertError::FormatNotSupported {
            extension: "pdf".to_string(),
            reason: "PDF is intentionally unsupported — Gemini, ChatGPT, and Claude \
                     handle PDF natively"
                .to_string(),
        });
    }

    if let Some(describer) = &options.async_image_describer {
        // Shared tail of every describer-aware arm below: convert, resolve the
        // image placeholders asynchronously, then apply strict mode and return.
        // A macro is used because each arm calls `convert_inner` on a different
        // concrete converter type.
        macro_rules! convert_and_describe {
            ($conv:expr) => {{
                let (mut result, pending) = $conv.convert_inner(data, base)?;
                if !pending.infos.is_empty() {
                    converter::ooxml_utils::resolve_image_placeholders_async(
                        &mut result.markdown,
                        &mut result.plain_text,
                        &pending.infos,
                        &pending.bytes,
                        describer.as_ref(),
                        &mut result.warnings,
                    )
                    .await;
                }
                return enforce_strict_mode(result, base.strict);
            }};
        }

        match ext.as_str() {
            "docx" => convert_and_describe!(converter::docx::DocxConverter),
            "pptx" => convert_and_describe!(converter::pptx::PptxConverter),
            "xlsx" | "xls" => convert_and_describe!(converter::xlsx::XlsxConverter),
            image_ext if converter::image::ImageConverter.can_convert(image_ext, data) => {
                convert_and_describe!(converter::image::ImageConverter)
            }
            // Anything else is handled by the synchronous path below.
            _ => {}
        }
    }

    convert_bytes(data, &ext, base)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Scratch directory under the system temp dir that is deleted on drop.
    ///
    /// Cleaning up in `Drop` means a failing assertion (which unwinds) no
    /// longer leaves the directory behind, and the process id in the name keeps
    /// concurrently running test processes from deleting each other's files.
    #[cfg(not(target_arch = "wasm32"))]
    struct ScratchDir {
        root: std::path::PathBuf,
    }

    #[cfg(not(target_arch = "wasm32"))]
    impl ScratchDir {
        /// Creates `<temp>/<label>_<pid>`.
        fn new(label: &str) -> Self {
            let root = std::env::temp_dir().join(format!("{label}_{}", std::process::id()));
            std::fs::create_dir_all(&root).unwrap();
            Self { root }
        }

        /// Writes `contents` to `name` inside the directory and returns the full path.
        fn write_file(&self, name: &str, contents: &[u8]) -> std::path::PathBuf {
            let file_path = self.root.join(name);
            std::fs::write(&file_path, contents).unwrap();
            file_path
        }
    }

    #[cfg(not(target_arch = "wasm32"))]
    impl Drop for ScratchDir {
        fn drop(&mut self) {
            // Best-effort cleanup; a failure here must not mask the test result.
            let _ = std::fs::remove_dir_all(&self.root);
        }
    }

    #[test]
    fn test_convert_bytes_input_too_large() {
        let data = vec![0u8; 1024];
        let options = ConversionOptions {
            max_input_bytes: 512,
            ..Default::default()
        };
        let result = convert_bytes(&data, "txt", &options);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(
            format!("{err}").contains("input too large"),
            "error was: {err}"
        );
    }

    #[test]
    fn test_convert_bytes_at_exact_limit_succeeds() {
        let data = b"Hello, world!";
        let options = ConversionOptions {
            max_input_bytes: data.len(),
            ..Default::default()
        };
        let result = convert_bytes(data, "txt", &options);
        assert!(result.is_ok());
    }

    #[test]
    fn test_convert_bytes_strict_mode_escalates_warning() {
        // 0xE9 on its own is not valid UTF-8, so decoding yields a warning.
        let data = b"caf\xe9";
        let options = ConversionOptions {
            strict: true,
            ..Default::default()
        };
        let result = convert_bytes(data, "txt", &options);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(
            format!("{err}").contains("strict mode"),
            "error should mention strict mode: {err}"
        );
    }

    #[test]
    fn test_convert_bytes_non_strict_keeps_warning() {
        let data = b"caf\xe9";
        let result = convert_bytes(data, "txt", &ConversionOptions::default()).unwrap();
        assert!(!result.warnings.is_empty(), "expected decoding warning");
    }

    #[test]
    fn test_convert_bytes_extension_case_insensitive() {
        let result = convert_bytes(b"hello world", " TXT ", &ConversionOptions::default()).unwrap();
        assert!(result.markdown.contains("hello world"));
    }

    #[test]
    fn test_convert_bytes_extension_with_leading_dot() {
        let result = convert_bytes(b"hello world", ".txt", &ConversionOptions::default()).unwrap();
        assert!(result.markdown.contains("hello world"));
    }

    #[cfg(not(target_arch = "wasm32"))]
    #[test]
    fn test_convert_file_non_ooxml_zip_returns_unsupported() {
        // Build a plain ZIP archive (no OOXML content) in memory.
        let mut buf = std::io::Cursor::new(Vec::new());
        {
            let mut zip_writer = zip::ZipWriter::new(&mut buf);
            let options = zip::write::SimpleFileOptions::default();
            zip_writer.start_file("hello.txt", options).unwrap();
            std::io::Write::write_all(&mut zip_writer, b"hello world").unwrap();
            zip_writer.finish().unwrap();
        }
        let zip_data = buf.into_inner();

        // The misleading `.txt` name must not route the archive to a text converter.
        let scratch = ScratchDir::new("anytomd_test_zip_misroute");
        let file_path = scratch.write_file("archive.txt", &zip_data);

        let options = ConversionOptions::default();
        let result = convert_file(&file_path, &options);
        assert!(result.is_err(), "expected UnsupportedFormat error");
        let err = result.unwrap_err();
        assert!(
            format!("{err}").contains("unsupported format"),
            "error was: {err}"
        );
    }

    #[cfg(not(target_arch = "wasm32"))]
    #[test]
    fn test_convert_file_input_too_large() {
        let path = std::path::Path::new("tests/fixtures/sample.csv");
        // The test is a no-op when the fixture is not available.
        if !path.exists() {
            return;
        }
        let file_size = std::fs::metadata(path).unwrap().len() as usize;
        let options = ConversionOptions {
            max_input_bytes: file_size.saturating_sub(1).max(1),
            ..Default::default()
        };
        let result = convert_file(path, &options);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(
            format!("{err}").contains("input too large"),
            "error was: {err}"
        );
    }

    #[test]
    fn test_convert_bytes_pdf_returns_descriptive_error() {
        let data = b"%PDF-1.7 fake pdf content";
        let options = ConversionOptions::default();
        let result = convert_bytes(data, "pdf", &options);
        assert!(result.is_err());
        let err = result.unwrap_err();
        let msg = format!("{err}");
        assert!(msg.contains("pdf"), "error should mention pdf: {msg}");
        assert!(
            msg.contains("intentionally unsupported"),
            "error should explain why: {msg}"
        );
    }

    #[cfg(not(target_arch = "wasm32"))]
    #[test]
    fn test_convert_file_pdf_returns_descriptive_error() {
        let scratch = ScratchDir::new("anytomd_test_pdf_error");
        let file_path = scratch.write_file("test.pdf", b"%PDF-1.7 fake");

        let options = ConversionOptions::default();
        let result = convert_file(&file_path, &options);
        assert!(result.is_err());
        let err = result.unwrap_err();
        let msg = format!("{err}");
        assert!(
            msg.contains("intentionally unsupported"),
            "error should explain why: {msg}"
        );
    }

    #[cfg(not(target_arch = "wasm32"))]
    #[test]
    fn test_convert_file_unknown_ext_json_with_long_leading_whitespace() {
        // 40 spaces of padding before the JSON payload, under an unknown extension.
        let mut data = vec![b' '; 40];
        data.extend_from_slice(br#"{"k":1}"#);

        let scratch = ScratchDir::new("anytomd_test_json_whitespace_detect");
        let file_path = scratch.write_file("payload.dat", &data);

        let result = convert_file(&file_path, &ConversionOptions::default()).unwrap();
        assert!(
            result.markdown.contains("\"k\""),
            "expected JSON conversion, markdown was: {}",
            result.markdown
        );
    }

    #[cfg(not(target_arch = "wasm32"))]
    #[test]
    fn test_convert_file_unknown_ext_json_with_utf8_bom() {
        // UTF-8 byte-order mark followed by the JSON payload.
        let mut data = vec![0xEF, 0xBB, 0xBF];
        data.extend_from_slice(br#"{"k":1}"#);

        let scratch = ScratchDir::new("anytomd_test_json_bom_detect");
        let file_path = scratch.write_file("payload.dat", &data);

        let result = convert_file(&file_path, &ConversionOptions::default()).unwrap();
        assert!(
            result.markdown.contains("\"k\""),
            "expected JSON conversion, markdown was: {}",
            result.markdown
        );
    }
}