1use std::borrow::Cow;
11
12pub mod converter;
13pub mod error;
14pub mod hocr;
15#[cfg(feature = "inline-images")]
16mod inline_images;
17#[cfg(feature = "metadata")]
18pub mod metadata;
19pub mod options;
20pub mod safety;
21pub mod text;
22pub mod wrapper;
23
24pub use error::{ConversionError, Result};
25#[cfg(feature = "inline-images")]
26pub use inline_images::{
27 HtmlExtraction, InlineImage, InlineImageConfig, InlineImageFormat, InlineImageSource, InlineImageWarning,
28};
29#[cfg(feature = "metadata")]
30pub use metadata::{
31 DocumentMetadata, ExtendedMetadata, HeaderMetadata, ImageMetadata, ImageType, LinkMetadata, LinkType,
32 MetadataConfig, StructuredData, StructuredDataType, TextDirection,
33};
34pub use options::{
35 CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
36 PreprocessingOptions, PreprocessingPreset, WhitespaceMode,
37};
38
39pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<String> {
56 let options = options.unwrap_or_default();
57
58 let normalized_html = if html.contains('\r') {
59 Cow::Owned(html.replace("\r\n", "\n").replace('\r', "\n"))
60 } else {
61 Cow::Borrowed(html)
62 };
63
64 let markdown = converter::convert_html(normalized_html.as_ref(), &options)?;
65
66 if options.wrap {
67 Ok(wrapper::wrap_markdown(&markdown, &options))
68 } else {
69 Ok(markdown)
70 }
71}
72
/// Converts an HTML string to Markdown while collecting inline images.
///
/// When `options` is `None`, `ConversionOptions::default()` is used. The
/// collector is configured from `image_cfg`.
///
/// Line endings are normalized before conversion (`\r\n` and lone `\r` become
/// `\n`). If `options.wrap` is set, the converted text is passed through
/// [`wrapper::wrap_markdown`].
///
/// The returned [`HtmlExtraction`] carries the Markdown together with the
/// images and warnings reported by the collector's `finish()`.
///
/// # Errors
///
/// * Propagates the error from `InlineImageCollector::new` and from
///   `converter::convert_html_with_inline_collector`.
/// * Returns `ConversionError::Other` if the shared collector still has other
///   owners once conversion has returned, so it cannot be taken back.
#[cfg(feature = "inline-images")]
pub fn convert_with_inline_images(
    html: &str,
    options: Option<ConversionOptions>,
    image_cfg: InlineImageConfig,
) -> Result<HtmlExtraction> {
    use std::cell::RefCell;
    use std::rc::Rc;

    let opts = options.unwrap_or_default();

    // Start from a borrow and only allocate when a carriage return is present.
    let mut source = Cow::Borrowed(html);
    if html.contains('\r') {
        source = Cow::Owned(html.replace("\r\n", "\n").replace('\r', "\n"));
    }

    // The converter receives its own handle to the collector; we keep one to
    // read the results back afterwards.
    let image_collector = inline_images::InlineImageCollector::new(image_cfg)?;
    let shared = Rc::new(RefCell::new(image_collector));

    let mut markdown =
        converter::convert_html_with_inline_collector(source.as_ref(), &opts, Rc::clone(&shared))?;
    if opts.wrap {
        markdown = wrapper::wrap_markdown(&markdown, &opts);
    }

    // Taking the collector back only succeeds if ours is the last handle.
    let recovered = Rc::try_unwrap(shared)
        .map(RefCell::into_inner)
        .map_err(|_| ConversionError::Other("failed to recover inline image state".to_string()))?;
    let (inline_images, warnings) = recovered.finish();

    Ok(HtmlExtraction {
        markdown,
        inline_images,
        warnings,
    })
}
121
/// Converts an HTML string to Markdown and collects metadata along the way.
///
/// When `options` is `None`, `ConversionOptions::default()` is used. The
/// metadata collector is configured from `metadata_cfg`.
///
/// Line endings are normalized before conversion (`\r\n` and lone `\r` become
/// `\n`). If `options.wrap` is set, the converted text is passed through
/// [`wrapper::wrap_markdown`].
///
/// Returns the Markdown paired with the [`ExtendedMetadata`] produced by the
/// collector's `finish()`.
///
/// # Errors
///
/// * Propagates any error from `converter::convert_html_with_metadata`.
/// * Returns `ConversionError::Other` if the shared collector still has other
///   owners once conversion has returned, so it cannot be taken back.
#[cfg(feature = "metadata")]
pub fn convert_with_metadata(
    html: &str,
    options: Option<ConversionOptions>,
    metadata_cfg: MetadataConfig,
) -> Result<(String, ExtendedMetadata)> {
    use std::cell::RefCell;
    use std::rc::Rc;

    let opts = options.unwrap_or_default();

    // Start from a borrow and only allocate when a carriage return is present.
    let mut source = Cow::Borrowed(html);
    if html.contains('\r') {
        source = Cow::Owned(html.replace("\r\n", "\n").replace('\r', "\n"));
    }

    // The converter receives its own handle to the collector; we keep one to
    // read the results back afterwards.
    let shared = Rc::new(RefCell::new(metadata::MetadataCollector::new(metadata_cfg)));

    let mut markdown =
        converter::convert_html_with_metadata(source.as_ref(), &opts, Rc::clone(&shared))?;
    if opts.wrap {
        markdown = wrapper::wrap_markdown(&markdown, &opts);
    }

    // Taking the collector back only succeeds if ours is the last handle.
    let recovered = Rc::try_unwrap(shared)
        .map(RefCell::into_inner)
        .map_err(|_| ConversionError::Other("failed to recover metadata state".to_string()))?;
    let extracted = recovered.finish();

    Ok((markdown, extracted))
}
184
/// End-to-end tests for [`convert_with_metadata`]; compiled only for test
/// builds with the `metadata` feature enabled.
#[cfg(all(test, feature = "metadata"))]
mod tests {
    use super::*;

    /// Full document: checks the Markdown output plus extracted language,
    /// headers, links and images with every extraction switch turned on.
    #[test]
    fn test_convert_with_metadata_full_workflow() {
        let html = "<html lang=\"en\" dir=\"ltr\"><head><title>Test Article</title></head><body><h1 id=\"main-title\">Main Title</h1><p>This is a paragraph with a <a href=\"https://example.com\">link</a>.</p><h2>Subsection</h2><p>Another paragraph with <a href=\"#main-title\">internal link</a>.</p><img src=\"https://example.com/image.jpg\" alt=\"Test image\" title=\"Image title\"></body></html>";

        let config = MetadataConfig {
            extract_headers: true,
            extract_links: true,
            extract_images: true,
            extract_structured_data: true,
            max_structured_data_size: 1_000_000,
        };

        let (markdown, metadata) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        // Markdown body still contains the heading text.
        assert!(!markdown.is_empty());
        assert!(markdown.contains("Main Title"));
        assert!(markdown.contains("Subsection"));

        // Document-level metadata.
        assert_eq!(metadata.document.language, Some("en".to_string()));

        // Headers are reported in document order with level, text and id.
        assert_eq!(metadata.headers.len(), 2);
        assert_eq!(metadata.headers[0].level, 1);
        assert_eq!(metadata.headers[0].text, "Main Title");
        assert_eq!(metadata.headers[0].id, Some("main-title".to_string()));
        assert_eq!(metadata.headers[1].level, 2);
        assert_eq!(metadata.headers[1].text, "Subsection");

        // One absolute URL and one `#fragment` link are present in the input.
        assert!(metadata.links.len() >= 2);
        let external_link = metadata.links.iter().find(|l| l.link_type == LinkType::External);
        assert!(external_link.is_some());
        let anchor_link = metadata.links.iter().find(|l| l.link_type == LinkType::Anchor);
        assert!(anchor_link.is_some());

        // The single `<img>` points at an absolute URL.
        assert_eq!(metadata.images.len(), 1);
        assert_eq!(metadata.images[0].alt, Some("Test image".to_string()));
        assert_eq!(metadata.images[0].title, Some("Image title".to_string()));
        assert_eq!(metadata.images[0].image_type, ImageType::External);
    }

    /// `<title>`, `<meta>` and Open Graph tags end up in `metadata.document`
    /// when the default configuration is used.
    #[test]
    fn test_convert_with_metadata_document_fields() {
        let html = "<html lang=\"en\"><head><title>Test Article</title><meta name=\"description\" content=\"Desc\"><meta name=\"author\" content=\"Author\"><meta property=\"og:title\" content=\"OG Title\"><meta property=\"og:description\" content=\"OG Desc\"></head><body><h1>Heading</h1></body></html>";

        let (_markdown, metadata) =
            convert_with_metadata(html, None, MetadataConfig::default()).expect("conversion should succeed");

        assert_eq!(
            metadata.document.title,
            Some("Test Article".to_string()),
            "document: {:?}",
            metadata.document
        );
        assert_eq!(metadata.document.description, Some("Desc".to_string()));
        assert_eq!(metadata.document.author, Some("Author".to_string()));
        assert_eq!(metadata.document.language, Some("en".to_string()));
        // Open Graph entries are looked up without the `og:` prefix.
        assert_eq!(metadata.document.open_graph.get("title"), Some(&"OG Title".to_string()));
        assert_eq!(
            metadata.document.open_graph.get("description"),
            Some(&"OG Desc".to_string())
        );
    }

    /// With every extraction switch off, the header/link/image lists stay
    /// empty while the document language is still reported.
    #[test]
    fn test_convert_with_metadata_empty_config() {
        let html = "<html lang=\"en\"><head><title>Test</title></head><body><h1>Title</h1><a href=\"#\">Link</a></body></html>";

        let config = MetadataConfig {
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: 0,
        };

        let (_markdown, metadata) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        assert!(metadata.headers.is_empty());
        assert!(metadata.links.is_empty());
        assert!(metadata.images.is_empty());
        assert_eq!(metadata.document.language, Some("en".to_string()));
    }

    /// An `<img>` whose `src` is a `data:` URI is expected to be classified
    /// as `ImageType::DataUri`.
    #[test]
    fn test_convert_with_metadata_data_uri_image() {
        // The fixture must actually carry a `data:` URI (a 1x1 PNG here);
        // an empty `src` would not exercise the classification under test.
        let html = "<html><body><img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=\" alt=\"Pixel\"></body></html>";

        let config = MetadataConfig::default();

        let (_markdown, metadata) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        assert_eq!(metadata.images.len(), 1);
        assert_eq!(metadata.images[0].image_type, ImageType::DataUri);
        assert_eq!(metadata.images[0].alt, Some("Pixel".to_string()));
    }

    /// Root-relative (`/page`) and parent-relative (`../other`) hrefs are
    /// both expected to be classified as `LinkType::Internal`.
    #[test]
    fn test_convert_with_metadata_relative_paths() {
        let html = r#"<html><body><a href="/page">Internal</a><a href="../other">Relative</a></body></html>"#;

        let config = MetadataConfig::default();

        let (_markdown, metadata) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        let internal_links: Vec<_> = metadata
            .links
            .iter()
            .filter(|l| l.link_type == LinkType::Internal)
            .collect();
        assert_eq!(internal_links.len(), 2);
    }
}