html_to_markdown_rs/
lib.rs1use std::borrow::Cow;
11
12pub mod converter;
13pub mod error;
14pub mod hocr;
15#[cfg(feature = "inline-images")]
16mod inline_images;
17pub mod options;
18pub mod text;
19pub mod wrapper;
20
21pub use error::{ConversionError, Result};
22#[cfg(feature = "inline-images")]
23pub use inline_images::{
24 HtmlExtraction, InlineImage, InlineImageConfig, InlineImageFormat, InlineImageSource, InlineImageWarning,
25};
26pub use options::{
27 CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
28 PreprocessingOptions, PreprocessingPreset, WhitespaceMode,
29};
30
31pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<String> {
48 let options = options.unwrap_or_default();
49
50 let normalized_html = if html.contains('\r') {
51 Cow::Owned(html.replace("\r\n", "\n").replace('\r', "\n"))
52 } else {
53 Cow::Borrowed(html)
54 };
55
56 let markdown = converter::convert_html(normalized_html.as_ref(), &options)?;
57
58 if options.wrap {
59 Ok(wrapper::wrap_markdown(&markdown, &options))
60 } else {
61 Ok(markdown)
62 }
63}
64
/// Converts an HTML string to Markdown while collecting inline images.
///
/// When `options` is `None`, `ConversionOptions::default()` is used.
/// `image_cfg` is handed to `inline_images::InlineImageCollector::new` to
/// build the collector that is shared with the converter for this call.
///
/// Line endings are normalized before conversion: every `\r\n` pair and every
/// remaining lone `\r` is replaced with `\n`. If `options.wrap` is set, the
/// Markdown is passed through `wrapper::wrap_markdown` after conversion.
///
/// The returned [`HtmlExtraction`] holds the final Markdown together with the
/// images and warnings produced by the collector's `finish()`.
///
/// # Errors
///
/// Propagates errors from constructing the collector and from
/// `converter::convert_html_with_inline_collector`. Returns
/// `ConversionError::Other` if the collector is still shared after conversion
/// and therefore cannot be taken back out of its `Rc`.
#[cfg(feature = "inline-images")]
pub fn convert_with_inline_images(
    html: &str,
    options: Option<ConversionOptions>,
    image_cfg: InlineImageConfig,
) -> Result<HtmlExtraction> {
    use std::cell::RefCell;
    use std::rc::Rc;

    let opts = options.unwrap_or_default();

    // Only allocate a new buffer when there is a carriage return to rewrite.
    let source = if html.contains('\r') {
        let without_crlf = html.replace("\r\n", "\n");
        Cow::Owned(without_crlf.replace('\r', "\n"))
    } else {
        Cow::Borrowed(html)
    };

    // The converter receives its own handle to the collector; this function
    // keeps one so the collected state can be reclaimed afterwards.
    let shared = Rc::new(RefCell::new(inline_images::InlineImageCollector::new(image_cfg)?));

    let mut markdown =
        converter::convert_html_with_inline_collector(source.as_ref(), &opts, Rc::clone(&shared))?;
    if opts.wrap {
        markdown = wrapper::wrap_markdown(&markdown, &opts);
    }

    // Taking the collector back by value only succeeds when no other `Rc`
    // handle to it is still alive.
    let cell = Rc::try_unwrap(shared)
        .map_err(|_| ConversionError::Other("failed to recover inline image state".to_string()))?;
    let (inline_images, warnings) = cell.into_inner().finish();

    Ok(HtmlExtraction {
        markdown,
        inline_images,
        warnings,
    })
}