pub mod code;
pub mod csv;
pub mod docx;
pub mod gemini;
pub mod html;
pub mod image;
pub mod ipynb;
pub mod json;
pub(crate) mod ooxml_utils;
pub mod plain_text;
pub mod pptx;
pub mod xlsx;
pub mod xml;
use std::sync::Arc;
#[cfg(feature = "async")]
use std::future::Future;
#[cfg(feature = "async")]
use std::pin::Pin;
use crate::error::ConvertError;
/// Synchronous hook that turns raw image bytes into a textual description.
///
/// Implementors must be `Send + Sync`, which lets a describer be stored behind
/// an `Arc<dyn ImageDescriber>` inside [`ConversionOptions`].
pub trait ImageDescriber: Send + Sync {
    /// Describes a single image.
    ///
    /// * `image_bytes` - the raw bytes of the image.
    /// * `mime_type` - the MIME type accompanying `image_bytes`.
    /// * `prompt` - text handed to the implementation; how it is used is up to
    ///   the implementor.
    ///
    /// # Errors
    ///
    /// Returns a [`ConvertError`] when the implementation cannot produce a
    /// description.
    fn describe(
        &self,
        image_bytes: &[u8],
        mime_type: &str,
        prompt: &str,
    ) -> Result<String, ConvertError>;
}
/// Machine-readable category attached to every [`ConversionWarning`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum WarningCode {
    /// An element of the input was skipped.
    SkippedElement,
    /// The input used something that is not fully supported; `decode_text`
    /// also reports this code when it had to decode from a non-UTF-8 encoding.
    UnsupportedFeature,
    /// A resource limit was reached.
    ResourceLimitReached,
    /// Part of the input was malformed; `decode_text` reports this code when
    /// replacement characters had to be inserted while decoding.
    MalformedSegment,
}
/// A non-fatal problem recorded while converting a document.
#[derive(Clone, Debug)]
pub struct ConversionWarning {
    /// Category of the warning.
    pub code: WarningCode,
    /// Human-readable explanation.
    pub message: String,
    /// Optional hint about where the problem occurred; `None` when no
    /// location is available.
    pub location: Option<String>,
}
/// Settings passed to [`Converter::convert`].
///
/// `Debug` is implemented by hand because the describer trait object carries
/// no `Debug` bound.
#[derive(Clone)]
pub struct ConversionOptions {
    /// Image-extraction switch (presumably controls whether
    /// `ConversionResult::images` is populated; confirm against the converters).
    pub extract_images: bool,
    /// Byte budget for image data, summed across images.
    pub max_total_image_bytes: usize,
    /// Strict-mode switch; its exact effect is defined by the individual
    /// converters, not in this module.
    pub strict: bool,
    /// Byte budget for the input document.
    pub max_input_bytes: usize,
    /// Byte budget for uncompressed ZIP content.
    pub max_uncompressed_zip_bytes: usize,
    /// Optional describer used to obtain text descriptions for images.
    pub image_describer: Option<Arc<dyn ImageDescriber>>,
}
impl std::fmt::Debug for ConversionOptions {
    /// Formats every field; the describer is rendered as `Some("..")` or
    /// `None` because the trait object itself is not `Debug`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let describer = self.image_describer.as_ref().map(|_| "..");

        let mut out = f.debug_struct("ConversionOptions");
        out.field("extract_images", &self.extract_images);
        out.field("max_total_image_bytes", &self.max_total_image_bytes);
        out.field("strict", &self.strict);
        out.field("max_input_bytes", &self.max_input_bytes);
        out.field(
            "max_uncompressed_zip_bytes",
            &self.max_uncompressed_zip_bytes,
        );
        out.field("image_describer", &describer);
        out.finish()
    }
}
impl Default for ConversionOptions {
    /// Builds the default options: image extraction off, strict mode off, no
    /// describer, and byte budgets of 4 GiB (images), 8 GiB (input) and
    /// 16 GiB (uncompressed ZIP content).
    ///
    /// The budgets use saturating multiplication, so on targets where `usize`
    /// cannot hold the product they clamp to `usize::MAX` instead of
    /// overflowing.
    fn default() -> Self {
        const GIB: usize = 1024 * 1024 * 1024;

        Self {
            extract_images: false,
            max_total_image_bytes: 4_usize.saturating_mul(GIB),
            strict: false,
            max_input_bytes: 8_usize.saturating_mul(GIB),
            max_uncompressed_zip_bytes: 16_usize.saturating_mul(GIB),
            image_describer: None,
        }
    }
}
/// Output of a successful [`Converter::convert`] call.
#[derive(Clone, Debug, Default)]
pub struct ConversionResult {
    /// The document rendered as Markdown.
    pub markdown: String,
    /// The document rendered as plain text.
    pub plain_text: String,
    /// Document title, when one is available.
    pub title: Option<String>,
    /// Images as `(name, bytes)` pairs (the first element is presumably the
    /// image filename; confirm against the converters).
    pub images: Vec<(String, Vec<u8>)>,
    /// Non-fatal problems recorded during the conversion.
    pub warnings: Vec<ConversionWarning>,
}
pub(crate) fn decode_text(data: &[u8]) -> (String, Option<ConversionWarning>) {
if let Ok(text) = std::str::from_utf8(data) {
let text = text.strip_prefix('\u{FEFF}').unwrap_or(text);
return (text.to_string(), None);
}
if let Some((encoding, bom_len)) = encoding_rs::Encoding::for_bom(data) {
let (decoded, _enc, had_errors) = encoding.decode(&data[bom_len..]);
let warning = if had_errors {
ConversionWarning {
code: WarningCode::MalformedSegment,
message: format!(
"replacement characters inserted during {} decoding",
encoding.name()
),
location: None,
}
} else {
ConversionWarning {
code: WarningCode::UnsupportedFeature,
message: format!("decoded from {} encoding", encoding.name()),
location: None,
}
};
return (decoded.into_owned(), Some(warning));
}
let (decoded, _enc, had_errors) = encoding_rs::WINDOWS_1252.decode(data);
let warning = if had_errors {
ConversionWarning {
code: WarningCode::MalformedSegment,
message: "replacement characters inserted during windows-1252 decoding".to_string(),
location: None,
}
} else {
ConversionWarning {
code: WarningCode::UnsupportedFeature,
message: "decoded from windows-1252 encoding (fallback)".to_string(),
location: None,
}
};
(decoded.into_owned(), Some(warning))
}
/// Guesses the MIME type of an image from its content, then from its name.
///
/// When `data` is at least 8 bytes long, the leading bytes are checked for the
/// PNG, JPEG, GIF (`GIF87a`/`GIF89a`) and WebP (`RIFF....WEBP`) signatures.
/// If none matches, or `data` is shorter than 8 bytes, the text after the last
/// `.` in `filename` (the whole name when there is no `.`) is compared
/// case-insensitively against a table of known extensions.
///
/// Returns `"application/octet-stream"` when nothing matches.
pub(crate) fn mime_from_image(filename: &str, data: &[u8]) -> &'static str {
    const PNG_SIGNATURE: [u8; 8] = [0x89, b'P', b'N', b'G', 0x0D, 0x0A, 0x1A, 0x0A];
    const JPEG_SIGNATURE: [u8; 3] = [0xFF, 0xD8, 0xFF];

    // Content sniffing is skipped entirely for inputs shorter than 8 bytes.
    let sniffed = if data.len() < 8 {
        None
    } else if data.starts_with(&PNG_SIGNATURE) {
        Some("image/png")
    } else if data.starts_with(&JPEG_SIGNATURE) {
        Some("image/jpeg")
    } else if data.starts_with(b"GIF87a") || data.starts_with(b"GIF89a") {
        Some("image/gif")
    } else if data.starts_with(b"RIFF") && data.get(8..12) == Some(&b"WEBP"[..]) {
        // RIFF is a generic container; bytes 8..12 carry the form type.
        Some("image/webp")
    } else {
        None
    };
    if let Some(mime) = sniffed {
        return mime;
    }

    // Extension fallback: everything after the final dot, lower-cased.
    let suffix = filename
        .rsplit_once('.')
        .map_or(filename, |(_, tail)| tail)
        .to_ascii_lowercase();
    match suffix.as_str() {
        "png" => "image/png",
        "jpg" | "jpeg" => "image/jpeg",
        "gif" => "image/gif",
        "webp" => "image/webp",
        "bmp" => "image/bmp",
        "tif" | "tiff" => "image/tiff",
        "svg" => "image/svg+xml",
        "heic" | "heif" => "image/heic",
        "avif" => "image/avif",
        _ => "application/octet-stream",
    }
}
/// Replaces the placeholder alt text of one Markdown image with a description.
///
/// Searches `markdown` for the first occurrence of the image reference
/// `![{placeholder}]({filename})` and rewrites it to
/// `![{description}]({filename})`. Only that first occurrence is touched, so
/// several images that share a filename can be updated one by one through
/// their distinct placeholders.
///
/// Returns an owned copy of `markdown`; when the placeholder reference is not
/// present the copy is unchanged. `description` is inserted verbatim, with no
/// Markdown escaping.
pub(crate) fn replace_image_alt_by_placeholder(
    markdown: &str,
    placeholder: &str,
    description: &str,
    filename: &str,
) -> String {
    // Both strings share the `(filename)` part so that a placeholder is only
    // matched together with the file it was emitted for.
    let target = format!("![{}]({})", placeholder, filename);
    let replacement = format!("![{}]({})", description, filename);
    markdown.replacen(&target, &replacement, 1)
}
/// Boxed, pinned future returned by [`AsyncImageDescriber::describe`].
///
/// Variant for every target except `wasm32`: the future must be `Send`.
#[cfg(all(feature = "async", not(target_arch = "wasm32")))]
pub type AsyncDescribeFuture<'a> =
    Pin<Box<dyn Future<Output = Result<String, ConvertError>> + Send + 'a>>;

/// Boxed, pinned future returned by [`AsyncImageDescriber::describe`].
///
/// `wasm32` variant: identical except that the future is not required to be
/// `Send`.
#[cfg(all(feature = "async", target_arch = "wasm32"))]
pub type AsyncDescribeFuture<'a> =
    Pin<Box<dyn Future<Output = Result<String, ConvertError>> + 'a>>;
/// Asynchronous counterpart of [`ImageDescriber`].
///
/// Variant for every target except `wasm32`: implementors must be
/// `Send + Sync`.
#[cfg(all(feature = "async", not(target_arch = "wasm32")))]
pub trait AsyncImageDescriber: Send + Sync {
    /// Starts describing one image and returns the pending result.
    ///
    /// All borrowed arguments share the lifetime `'a` with `self`, so the
    /// returned future may hold on to them until it completes.
    fn describe<'a>(
        &'a self,
        image_bytes: &'a [u8],
        mime_type: &'a str,
        prompt: &'a str,
    ) -> AsyncDescribeFuture<'a>;
}

/// Asynchronous counterpart of [`ImageDescriber`].
///
/// `wasm32` variant: same method, but without the `Send + Sync` supertraits.
#[cfg(all(feature = "async", target_arch = "wasm32"))]
pub trait AsyncImageDescriber {
    /// Starts describing one image and returns the pending result.
    ///
    /// All borrowed arguments share the lifetime `'a` with `self`, so the
    /// returned future may hold on to them until it completes.
    fn describe<'a>(
        &'a self,
        image_bytes: &'a [u8],
        mime_type: &'a str,
        prompt: &'a str,
    ) -> AsyncDescribeFuture<'a>;
}
/// [`ConversionOptions`] plus an optional asynchronous image describer.
///
/// Only available with the `async` feature. The derived `Default` uses
/// `ConversionOptions::default()` for `base` and leaves the describer unset.
#[cfg(feature = "async")]
#[derive(Default)]
pub struct AsyncConversionOptions {
    /// The synchronous option set.
    pub base: ConversionOptions,
    /// Optional asynchronous describer used to obtain image descriptions.
    pub async_image_describer: Option<Arc<dyn AsyncImageDescriber>>,
}
#[cfg(feature = "async")]
impl std::fmt::Debug for AsyncConversionOptions {
    /// Formats `base` in full and renders the describer as `Some("..")` or
    /// `None`, since the trait object itself is not `Debug`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let describer = self.async_image_describer.as_ref().map(|_| "..");

        let mut out = f.debug_struct("AsyncConversionOptions");
        out.field("base", &self.base);
        out.field("async_image_describer", &describer);
        out.finish()
    }
}
/// Interface implemented by each format-specific converter.
pub trait Converter {
    /// Lists the file extensions this converter handles.
    fn supported_extensions(&self) -> &[&str];

    /// Reports whether this converter accepts the given input.
    ///
    /// The default implementation is an exact, case-sensitive match of
    /// `extension` against [`supported_extensions`](Self::supported_extensions)
    /// and ignores `_header_bytes`.
    fn can_convert(&self, extension: &str, _header_bytes: &[u8]) -> bool {
        self.supported_extensions()
            .iter()
            .any(|known| *known == extension)
    }

    /// Converts the raw document bytes in `data` according to `options`.
    ///
    /// # Errors
    ///
    /// Returns a [`ConvertError`] when the conversion fails.
    fn convert(
        &self,
        data: &[u8],
        options: &ConversionOptions,
    ) -> Result<ConversionResult, ConvertError>;
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::ConvertError;

    /// Describer stub that always answers with the fixed string `"mock"`.
    struct MockDescriber;

    impl ImageDescriber for MockDescriber {
        fn describe(
            &self,
            _image_bytes: &[u8],
            _mime_type: &str,
            _prompt: &str,
        ) -> Result<String, ConvertError> {
            Ok("mock".to_string())
        }
    }

    /// Default options with a `MockDescriber` installed.
    fn options_with_mock_describer() -> ConversionOptions {
        ConversionOptions {
            image_describer: Some(Arc::new(MockDescriber)),
            ..Default::default()
        }
    }

    // ---- decode_text ----

    #[test]
    fn test_decode_text_utf8_passthrough() {
        let (text, warning) = decode_text(b"Hello, world!");
        assert_eq!(text, "Hello, world!");
        assert!(warning.is_none());
    }

    #[test]
    fn test_decode_text_utf8_bom_stripped() {
        // EF BB BF is the UTF-8 encoding of U+FEFF.
        let mut input = vec![0xEF, 0xBB, 0xBF];
        input.extend_from_slice(b"BOM content");
        let (text, warning) = decode_text(&input);
        assert_eq!(text, "BOM content");
        assert!(warning.is_none());
    }

    #[test]
    fn test_decode_text_utf16_le_bom() {
        let input: Vec<u8> = vec![0xFF, 0xFE, b'A', 0x00, b'B', 0x00];
        let (text, warning) = decode_text(&input);
        assert_eq!(text, "AB");
        assert!(warning.is_some());
    }

    #[test]
    fn test_decode_text_utf16_be_bom() {
        let input: Vec<u8> = vec![0xFE, 0xFF, 0x00, b'A', 0x00, b'B'];
        let (text, warning) = decode_text(&input);
        assert_eq!(text, "AB");
        assert!(warning.is_some());
    }

    #[test]
    fn test_decode_text_windows_1252_fallback() {
        // 0xE9 alone is invalid UTF-8 but is "é" in windows-1252.
        let input = b"caf\xe9";
        let (text, warning) = decode_text(input);
        assert_eq!(text, "café");
        assert!(warning.is_some());
        let w = warning.unwrap();
        assert_eq!(w.code, WarningCode::UnsupportedFeature);
    }

    #[test]
    fn test_decode_text_cjk_utf8() {
        // Korean, Chinese and Japanese samples; multi-byte UTF-8 must pass
        // through untouched and without a warning.
        let sample = "한국어 中文 日本語";
        let (text, warning) = decode_text(sample.as_bytes());
        assert_eq!(text, sample);
        assert!(warning.is_none());
    }

    // ---- mime_from_image ----

    #[test]
    fn test_mime_from_image_png_magic_bytes() {
        let png_header = [0x89, b'P', b'N', b'G', 0x0D, 0x0A, 0x1A, 0x0A];
        assert_eq!(mime_from_image("image.png", &png_header), "image/png");
        // Content wins over a misleading extension.
        assert_eq!(mime_from_image("image.jpg", &png_header), "image/png");
    }

    #[test]
    fn test_mime_from_image_jpeg_magic_bytes() {
        let jpeg_header = [0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46];
        assert_eq!(mime_from_image("photo.jpg", &jpeg_header), "image/jpeg");
    }

    #[test]
    fn test_mime_from_image_gif_magic_bytes() {
        assert_eq!(mime_from_image("anim.gif", b"GIF89a.."), "image/gif");
        assert_eq!(mime_from_image("old.gif", b"GIF87a.."), "image/gif");
    }

    #[test]
    fn test_mime_from_image_webp_magic_bytes() {
        let webp = b"RIFF\x00\x00\x00\x00WEBP";
        assert_eq!(mime_from_image("photo.webp", webp), "image/webp");
    }

    #[test]
    fn test_mime_from_image_extension_fallback() {
        // Seven bytes with no known signature force the extension lookup.
        let empty = b"unknown";
        let expectations = [
            ("file.png", "image/png"),
            ("file.jpg", "image/jpeg"),
            ("file.jpeg", "image/jpeg"),
            ("file.gif", "image/gif"),
            ("file.webp", "image/webp"),
            ("file.bmp", "image/bmp"),
            ("file.tiff", "image/tiff"),
            ("file.svg", "image/svg+xml"),
            ("file.heic", "image/heic"),
            ("file.heif", "image/heic"),
            ("file.avif", "image/avif"),
            ("file.xyz", "application/octet-stream"),
        ];
        for (name, mime) in expectations {
            assert_eq!(mime_from_image(name, empty), mime, "filename: {}", name);
        }
    }

    // ---- ConversionOptions ----

    #[test]
    fn test_conversion_options_default_has_no_describer() {
        let opts = ConversionOptions::default();
        assert!(opts.image_describer.is_none());
    }

    #[test]
    fn test_conversion_options_debug_format() {
        let opts = ConversionOptions::default();
        let debug = format!("{:?}", opts);
        assert!(debug.contains("ConversionOptions"));
        assert!(debug.contains("image_describer: None"));
    }

    #[test]
    fn test_conversion_options_clone_with_describer() {
        let opts = options_with_mock_describer();
        let cloned = opts.clone();
        assert!(cloned.image_describer.is_some());
    }

    #[test]
    fn test_conversion_options_debug_with_describer() {
        let opts = options_with_mock_describer();
        let debug = format!("{:?}", opts);
        assert!(debug.contains("image_describer: Some"));
    }

    // ---- replace_image_alt_by_placeholder ----

    #[test]
    fn test_replace_image_alt_placeholder_match() {
        // NOTE(review): the fixture and the expected value are both empty, so
        // this test cannot observe a replacement. The `![alt](file)` image
        // markup appears to have been lost from these literals; restore it
        // and confirm the expected output.
        let md = "";
        let result = replace_image_alt_by_placeholder(md, "__img_0__", "A cute cat", "cat.png");
        assert_eq!(result, "");
    }

    #[test]
    fn test_replace_image_alt_placeholder_no_match() {
        // The document references `__img_0__`, not the requested
        // `__img_99__`, so it must come back unchanged.
        let md = "![__img_0__](cat.png)";
        let result = replace_image_alt_by_placeholder(md, "__img_99__", "description", "cat.png");
        assert_eq!(result, md);
    }

    #[test]
    fn test_replace_image_alt_placeholder_only_first_occurrence() {
        // NOTE(review): the image references around " and " appear to be
        // missing from both literals, so "first occurrence only" is not
        // actually exercised; restore the markup and confirm.
        let md = " and ";
        let result = replace_image_alt_by_placeholder(md, "__img_0__", "A cat", "cat.png");
        assert_eq!(result, " and ");
    }

    #[test]
    fn test_replace_image_alt_placeholder_same_filename_different_placeholders() {
        // NOTE(review): `contains("")` is always true and the fixture holds
        // only a newline; the two `logo.png` references with distinct
        // placeholders appear to be missing. Restore and confirm.
        let md = "\n";
        let result = replace_image_alt_by_placeholder(md, "__img_1__", "Second logo", "logo.png");
        assert!(result.contains(""));
        assert!(result.contains(""));
    }
}