html_to_markdown_rs/
lib.rs1pub mod converter;
11pub mod error;
12pub mod hocr;
13#[cfg(feature = "inline-images")]
14mod inline_images;
15pub mod options;
16pub mod sanitizer;
17pub mod text;
18pub mod wrapper;
19
20pub use error::{ConversionError, Result};
21#[cfg(feature = "inline-images")]
22pub use inline_images::{
23 HtmlExtraction, InlineImage, InlineImageConfig, InlineImageFormat, InlineImageSource, InlineImageWarning,
24};
25pub use options::{
26 CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
27 PreprocessingOptions, PreprocessingPreset, WhitespaceMode,
28};
29
30pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<String> {
47 let options = options.unwrap_or_default();
48
49 let normalized_html = html.replace("\r\n", "\n").replace('\r', "\n");
50
51 let clean_html = if options.preprocessing.enabled {
52 sanitizer::sanitize(&normalized_html, &options.preprocessing, &options.preserve_tags)?
53 } else {
54 normalized_html
55 };
56
57 let markdown = converter::convert_html(&clean_html, &options)?;
58
59 if options.wrap {
60 Ok(wrapper::wrap_markdown(&markdown, &options))
61 } else {
62 Ok(markdown)
63 }
64}
65
/// Converts an HTML document into Markdown while collecting inline images.
///
/// Follows the same steps as [`convert`] (line-ending normalization, optional
/// sanitizing, conversion, optional wrapping), but runs the conversion through
/// [`converter::convert_html_with_inline_collector`] with a shared
/// `InlineImageCollector` built from `image_cfg`. Once conversion is done the
/// collector is reclaimed and its images and warnings are returned alongside the
/// Markdown in an [`HtmlExtraction`].
///
/// Passing `None` for `options` uses `ConversionOptions::default()`.
///
/// # Errors
///
/// Propagates errors from the sanitizer, from constructing the collector, and
/// from the converter. Returns `ConversionError::Other` if the collector is
/// still shared after conversion and therefore cannot be taken back.
#[cfg(feature = "inline-images")]
pub fn convert_with_inline_images(
    html: &str,
    options: Option<ConversionOptions>,
    image_cfg: InlineImageConfig,
) -> Result<HtmlExtraction> {
    use std::cell::RefCell;
    use std::rc::Rc;

    let opts = options.unwrap_or_default();

    // CRLF must be collapsed before lone CRs, otherwise "\r\n" would turn into two newlines.
    let mut working = html.replace("\r\n", "\n").replace('\r', "\n");
    if opts.preprocessing.enabled {
        working = sanitizer::sanitize(&working, &opts.preprocessing, &opts.preserve_tags)?;
    }

    // The converter receives its own handle to the collector; we keep one to read results back.
    let image_sink = inline_images::InlineImageCollector::new(image_cfg)?;
    let shared_sink = Rc::new(RefCell::new(image_sink));

    let rendered = converter::convert_html_with_inline_collector(&working, &opts, Rc::clone(&shared_sink))?;

    let markdown = if opts.wrap {
        wrapper::wrap_markdown(&rendered, &opts)
    } else {
        rendered
    };

    // Taking the collector back only succeeds if no other handle is still alive.
    let image_sink = match Rc::try_unwrap(shared_sink) {
        Ok(cell) => cell.into_inner(),
        Err(_) => {
            return Err(ConversionError::Other(
                "failed to recover inline image state".to_string(),
            ));
        }
    };
    let (inline_images, warnings) = image_sink.finish();

    Ok(HtmlExtraction {
        markdown,
        inline_images,
        warnings,
    })
}