#![allow(
dead_code,
missing_docs,
clippy::unused_self,
clippy::unnecessary_wraps,
clippy::uninlined_format_args
)]
use once_cell::sync::Lazy;
use regex::Regex;
use super::types::*;
use crate::error::Result;
/// Matches one Markdown-significant character (backslash, backtick, `*`, `_`,
/// braces, brackets, parentheses, `#`, `+`, `-`, `.`, `!` or `|`) and captures
/// it in group 1.
///
/// Compiled lazily on first access; the pattern is a fixed literal, so the
/// `unwrap` can only fail if the literal itself is invalid.
// NOTE(review): no use of this pattern is visible in this part of the file.
static MD_ESCAPE_PATTERN: Lazy<Regex> =
Lazy::new(|| Regex::new(r"([\\`*_\{\}\[\]()#+\-.!|])").unwrap());
/// Matches an `http://` or `https://` URL: the scheme followed by one or more
/// non-whitespace characters. Compiled lazily on first access.
static URL_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"https?://[^\s]+").unwrap());
/// Configuration for rendering a `DoclingDocument` as Markdown text.
///
/// Values start from the `Default` implementation and are adjusted through the
/// consuming `with_*` builder methods.
#[derive(Debug)]
pub struct MarkdownSerializer {
/// Number of spaces emitted per list nesting level.
indent: usize,
/// Whether `_` is backslash-escaped when text is escaped.
escape_underscores: bool,
/// Master switch for Markdown escaping; when `false`, text is emitted verbatim.
escape_special_chars: bool,
/// Flag set by `with_tables`; presumably controls whether table items are
/// emitted — confirm against `serialize_item`.
enable_tables: bool,
/// Flag set by `with_images`; presumably controls whether picture items are
/// emitted — confirm against `serialize_item`.
enable_images: bool,
}
impl Default for MarkdownSerializer {
    /// Builds the stock configuration: a four-space list indent with every
    /// escaping and output option switched on.
    fn default() -> Self {
        let indent = 4;
        let (escape_underscores, escape_special_chars) = (true, true);
        let (enable_tables, enable_images) = (true, true);
        Self {
            enable_images,
            enable_tables,
            escape_special_chars,
            escape_underscores,
            indent,
        }
    }
}
impl MarkdownSerializer {
    /// Creates a serializer with the default configuration.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the number of spaces emitted per list nesting level.
    pub fn with_indent(mut self, indent: usize) -> Self {
        self.indent = indent;
        self
    }

    /// Enables or disables backslash-escaping of `_`.
    ///
    /// Only has an effect while special-character escaping is enabled.
    pub fn with_escape_underscores(mut self, enable: bool) -> Self {
        self.escape_underscores = enable;
        self
    }

    /// Enables or disables Markdown escaping altogether.
    pub fn with_escape_special_chars(mut self, enable: bool) -> Self {
        self.escape_special_chars = enable;
        self
    }

    /// Enables or disables table output; disabled tables are omitted entirely.
    pub fn with_tables(mut self, enable: bool) -> Self {
        self.enable_tables = enable;
        self
    }

    /// Enables or disables picture output; disabled pictures are omitted entirely.
    pub fn with_images(mut self, enable: bool) -> Self {
        self.enable_images = enable;
        self
    }

    /// Renders every item of `doc` and joins the results with blank lines.
    ///
    /// Runs of three or more newlines anywhere in the output (including inside
    /// code blocks) are collapsed to a single blank line, and the final text is
    /// trimmed. The current implementation never returns `Err`.
    pub fn serialize(&self, doc: &DoclingDocument) -> Result<String> {
        let parts: Vec<String> = doc
            .items
            .iter()
            .filter_map(|item| self.serialize_item(item))
            .collect();
        let mut output = parts.join("\n\n");
        while output.contains("\n\n\n") {
            output = output.replace("\n\n\n", "\n\n");
        }
        Ok(output.trim().to_string())
    }

    /// Dispatches one document item to its renderer.
    ///
    /// Returns `None` for items that are switched off by configuration
    /// (tables when `enable_tables` is false, pictures when `enable_images`
    /// is false), so they leave no trace in the output.
    fn serialize_item(&self, item: &DocItem) -> Option<String> {
        match item {
            DocItem::Title(text_item) => Some(self.serialize_title(text_item)),
            DocItem::SectionHeader(header) => Some(self.serialize_section_header(header)),
            DocItem::Paragraph(text_item) => Some(self.serialize_paragraph(text_item)),
            DocItem::ListItem(list_item) => Some(self.serialize_list_item(list_item)),
            DocItem::Table(table) => self.enable_tables.then(|| self.serialize_table(table)),
            DocItem::Picture(picture) => {
                self.enable_images.then(|| self.serialize_picture(picture))
            }
            DocItem::Code(code) => Some(self.serialize_code(code)),
            DocItem::Formula(formula) => Some(self.serialize_formula(formula)),
        }
    }

    /// Renders the document title as a level-one heading.
    fn serialize_title(&self, item: &TextItem) -> String {
        let text = self.apply_formatting(&item.text, item.formatting.as_ref());
        format!("# {}", text)
    }

    /// Renders a section header with `level + 1` hashes, so level 1 becomes `##`.
    fn serialize_section_header(&self, item: &SectionHeaderItem) -> String {
        let text = self.apply_formatting(&item.text, item.formatting.as_ref());
        let hashes = "#".repeat(item.level + 1);
        format!("{} {}", hashes, text)
    }

    /// Renders a paragraph, prefixing checkbox items with a task-list marker.
    ///
    /// Escaping and emphasis are applied to the item text only; the marker is
    /// added afterwards so that `- [x]` is neither backslash-escaped nor
    /// swallowed into the emphasis span.
    fn serialize_paragraph(&self, item: &TextItem) -> String {
        let body = self.apply_formatting(&item.text, item.formatting.as_ref());
        match item.label {
            DocItemLabel::CheckboxSelected => format!("- [x] {}", body),
            DocItemLabel::CheckboxUnselected => format!("- [ ] {}", body),
            _ => body,
        }
    }

    /// Renders one list item, indented by `level * indent` spaces.
    ///
    /// Enumerated items always use `1.` as their marker; other items use the
    /// marker stored on the item. The item text is inserted verbatim.
    fn serialize_list_item(&self, item: &ListItemData) -> String {
        let indent_str = " ".repeat(item.level * self.indent);
        let marker = if item.enumerated {
            "1.".to_string()
        } else {
            item.marker.clone()
        };
        format!("{}{} {}", indent_str, marker, item.text)
    }

    /// Renders a table as a pipe table, preceded by its caption if present.
    ///
    /// The first grid row is used as the header row. Newlines inside a cell
    /// become spaces and `|` is escaped so cell content cannot break the row.
    /// An empty grid yields just the caption (or an empty string).
    fn serialize_table(&self, table: &TableItem) -> String {
        let mut output = String::new();
        if let Some(caption) = &table.caption {
            output.push_str(caption);
            output.push_str("\n\n");
        }
        let mut rows = table.data.grid.iter();
        if let Some(header) = rows.next() {
            Self::push_table_row(&mut output, header.iter().map(|cell| cell.text.as_str()));
            // Delimiter row: one `---` column per header cell.
            output.push('|');
            for _ in header {
                output.push_str(" --- |");
            }
            output.push('\n');
            for row in rows {
                Self::push_table_row(&mut output, row.iter().map(|cell| cell.text.as_str()));
            }
        }
        output.trim_end().to_string()
    }

    /// Appends one `| a | b |` line (with trailing newline) to `output`.
    fn push_table_row<'a>(output: &mut String, cells: impl Iterator<Item = &'a str>) {
        output.push('|');
        for text in cells {
            output.push(' ');
            // A raw `|` would be read as a column separator, so escape it.
            output.push_str(&text.replace('\n', " ").replace('|', r"\|"));
            output.push_str(" |");
        }
        output.push('\n');
    }

    /// Renders a picture as its placeholder text, preceded by its caption if present.
    fn serialize_picture(&self, picture: &PictureItem) -> String {
        let mut output = String::new();
        if let Some(caption) = &picture.caption {
            output.push_str(caption);
            output.push_str("\n\n");
        }
        output.push_str(&picture.placeholder);
        output
    }

    /// Renders a fenced code block, tagging the fence with the language when known.
    fn serialize_code(&self, code: &CodeItem) -> String {
        if let Some(lang) = &code.language {
            format!("```{}\n{}\n```", lang, code.text)
        } else {
            format!("```\n{}\n```", code.text)
        }
    }

    /// Renders a formula between `$…$` (inline) or `$$…$$` (display) delimiters.
    fn serialize_formula(&self, formula: &FormulaItem) -> String {
        if formula.is_inline {
            format!("${}$", formula.text)
        } else {
            format!("$${}$$", formula.text)
        }
    }

    /// Escapes `text` and wraps it in emphasis markers according to `formatting`.
    ///
    /// Bold and italic combine into `***…***`; underline is applied outermost
    /// as a `<u>…</u>` wrapper.
    fn apply_formatting(&self, text: &str, formatting: Option<&Formatting>) -> String {
        let mut result = self.escape_markdown_chars(text);
        if let Some(fmt) = formatting {
            result = match (fmt.bold, fmt.italic) {
                (true, true) => format!("***{}***", result),
                (true, false) => format!("**{}**", result),
                (false, true) => format!("*{}*", result),
                (false, false) => result,
            };
            if fmt.underline {
                result = format!("<u>{}</u>", result);
            }
        }
        result
    }

    /// Backslash-escapes `_`, `*`, `[` and `]` in `text`.
    ///
    /// Text is returned untouched when escaping is disabled, when it contains
    /// an http(s) URL, or when it starts and ends with a backtick. Underscores
    /// are additionally left alone when disabled by configuration or when the
    /// text contains `](`.
    fn escape_markdown_chars(&self, text: &str) -> String {
        if !self.escape_special_chars {
            return text.to_string();
        }
        if URL_PATTERN.is_match(text) {
            return text.to_string();
        }
        if text.starts_with('`') && text.ends_with('`') {
            return text.to_string();
        }
        let mut result = text.to_string();
        if self.escape_underscores && !text.contains("](") {
            result = result.replace('_', r"\_");
        }
        result = result.replace('*', r"\*");
        result = result.replace('[', r"\[");
        result = result.replace(']', r"\]");
        result
    }

    /// Formats an inline link as `[text](url)`; neither part is escaped.
    fn format_link(&self, text: &str, url: &str) -> String {
        format!("[{}]({})", text, url)
    }

    /// Wraps `text` in single backticks.
    fn format_inline_code(&self, text: &str) -> String {
        format!("`{}`", text)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The title is rendered as a level-one heading.
    #[test]
    fn test_serialize_title() {
        let serializer = MarkdownSerializer::new();
        let item = TextItem {
            text: "Test Title".to_string(),
            formatting: None,
            label: DocItemLabel::Title,
        };
        let result = serializer.serialize_title(&item);
        assert_eq!(result, "# Test Title");
    }

    /// A section header at level 1 is rendered with two hashes (`level + 1`).
    #[test]
    fn test_serialize_section_header() {
        let serializer = MarkdownSerializer::new();
        let item = SectionHeaderItem {
            text: "Section".to_string(),
            level: 1,
            formatting: None,
        };
        let result = serializer.serialize_section_header(&item);
        assert_eq!(result, "## Section");
    }

    /// Bold, italic and their combination map to `**`, `*` and `***` wrappers;
    /// underline wraps the emphasized text in `<u>` tags.
    #[test]
    fn test_apply_formatting() {
        let serializer = MarkdownSerializer::new();
        let bold = Formatting {
            bold: true,
            italic: false,
            underline: false,
        };
        assert_eq!(serializer.apply_formatting("text", Some(&bold)), "**text**");
        let italic = Formatting {
            bold: false,
            italic: true,
            underline: false,
        };
        assert_eq!(serializer.apply_formatting("text", Some(&italic)), "*text*");
        let both = Formatting {
            bold: true,
            italic: true,
            underline: false,
        };
        assert_eq!(
            serializer.apply_formatting("text", Some(&both)),
            "***text***"
        );
        let bold_underline = Formatting {
            bold: true,
            italic: false,
            underline: true,
        };
        assert_eq!(
            serializer.apply_formatting("text", Some(&bold_underline)),
            "<u>**text**</u>"
        );
    }
}