1use std::borrow::Cow;
11
12pub mod converter;
13pub mod error;
14pub mod hocr;
15#[cfg(feature = "inline-images")]
16mod inline_images;
17#[cfg(feature = "metadata")]
18pub mod metadata;
19pub mod options;
20pub mod safety;
21pub mod text;
22pub mod wrapper;
23
24pub use error::{ConversionError, Result};
25#[cfg(feature = "inline-images")]
26pub use inline_images::{
27 HtmlExtraction, InlineImage, InlineImageConfig, InlineImageFormat, InlineImageSource, InlineImageWarning,
28};
29#[cfg(feature = "metadata")]
30pub use metadata::{
31 DEFAULT_MAX_STRUCTURED_DATA_SIZE, DocumentMetadata, ExtendedMetadata, HeaderMetadata, ImageMetadata, ImageType,
32 LinkMetadata, LinkType, MetadataConfig, StructuredData, StructuredDataType, TextDirection,
33};
34pub use options::{
35 CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
36 PreprocessingOptions, PreprocessingPreset, WhitespaceMode,
37};
38
39pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<String> {
56 let options = options.unwrap_or_default();
57
58 let normalized_html = if html.contains('\r') {
59 Cow::Owned(html.replace("\r\n", "\n").replace('\r', "\n"))
60 } else {
61 Cow::Borrowed(html)
62 };
63
64 let markdown = converter::convert_html(normalized_html.as_ref(), &options)?;
65
66 if options.wrap {
67 Ok(wrapper::wrap_markdown(&markdown, &options))
68 } else {
69 Ok(markdown)
70 }
71}
72
/// Converts an HTML string to Markdown while gathering inline images along the way.
///
/// Carriage returns in `html` are normalized to `\n` before conversion. An
/// `InlineImageCollector` built from `image_cfg` is shared with
/// [`converter::convert_html_with_inline_collector`] for the duration of the conversion, and its
/// results are returned next to the Markdown. When `options.wrap` is set, the Markdown is passed
/// through [`wrapper::wrap_markdown`].
///
/// # Arguments
///
/// * `html` - The HTML source text.
/// * `options` - Conversion settings; `None` falls back to `ConversionOptions::default()`.
/// * `image_cfg` - Configuration used to construct the inline-image collector.
///
/// # Errors
///
/// Returns an error if the collector cannot be constructed, if the conversion itself fails, or
/// (`ConversionError::Other`) if the shared collector is still referenced elsewhere once the
/// conversion has returned.
#[cfg(feature = "inline-images")]
pub fn convert_with_inline_images(
    html: &str,
    options: Option<ConversionOptions>,
    image_cfg: InlineImageConfig,
) -> Result<HtmlExtraction> {
    use std::{cell::RefCell, rc::Rc};

    let opts = options.unwrap_or_default();

    // Borrow the input untouched unless it contains a carriage return; only then pay for a
    // rewritten copy.
    let source: Cow<'_, str> = if html.contains('\r') {
        html.replace("\r\n", "\n").replace('\r', "\n").into()
    } else {
        html.into()
    };

    // The converter receives its own handle to the collector; this one is kept so the
    // collected state can be taken back afterwards.
    let shared = Rc::new(RefCell::new(inline_images::InlineImageCollector::new(image_cfg)?));

    let rendered =
        converter::convert_html_with_inline_collector(source.as_ref(), &opts, Rc::clone(&shared))?;

    let markdown = if opts.wrap {
        wrapper::wrap_markdown(&rendered, &opts)
    } else {
        rendered
    };

    // Taking the collector back by value only succeeds when this is the last remaining handle.
    let cell = Rc::try_unwrap(shared)
        .map_err(|_| ConversionError::Other(String::from("failed to recover inline image state")))?;
    let collector = cell.into_inner();
    let (inline_images, warnings) = collector.finish();

    Ok(HtmlExtraction {
        markdown,
        inline_images,
        warnings,
    })
}
121
/// Converts an HTML string to Markdown and returns the metadata collected during conversion.
///
/// Carriage returns in `html` are normalized to `\n` before conversion. A `MetadataCollector`
/// built from `metadata_cfg` is shared with [`converter::convert_html_with_metadata`] for the
/// duration of the conversion. When `options.wrap` is set, the Markdown is passed through
/// [`wrapper::wrap_markdown`].
///
/// # Arguments
///
/// * `html` - The HTML source text.
/// * `options` - Conversion settings; `None` falls back to `ConversionOptions::default()`.
/// * `metadata_cfg` - Configuration used to construct the metadata collector.
///
/// # Returns
///
/// A `(markdown, metadata)` tuple.
///
/// # Errors
///
/// Returns an error if the conversion itself fails, or (`ConversionError::Other`) if the shared
/// collector is still referenced elsewhere once the conversion has returned.
#[cfg(feature = "metadata")]
pub fn convert_with_metadata(
    html: &str,
    options: Option<ConversionOptions>,
    metadata_cfg: MetadataConfig,
) -> Result<(String, ExtendedMetadata)> {
    use std::{cell::RefCell, rc::Rc};

    let opts = options.unwrap_or_default();

    // Borrow the input untouched unless it contains a carriage return; only then pay for a
    // rewritten copy.
    let source: Cow<'_, str> = if html.contains('\r') {
        html.replace("\r\n", "\n").replace('\r', "\n").into()
    } else {
        html.into()
    };

    // The converter receives its own handle to the collector; this one is kept so the
    // collected state can be taken back afterwards.
    let shared = Rc::new(RefCell::new(metadata::MetadataCollector::new(metadata_cfg)));

    let rendered = converter::convert_html_with_metadata(source.as_ref(), &opts, Rc::clone(&shared))?;

    let markdown = if opts.wrap {
        wrapper::wrap_markdown(&rendered, &opts)
    } else {
        rendered
    };

    // Taking the collector back by value only succeeds when this is the last remaining handle.
    let cell = Rc::try_unwrap(shared)
        .map_err(|_| ConversionError::Other(String::from("failed to recover metadata state")))?;
    let collector = cell.into_inner();
    let metadata = collector.finish();

    Ok((markdown, metadata))
}
274
#[cfg(all(test, feature = "metadata"))]
mod tests {
    use super::*;

    /// Runs a document with headings, links and an image through `convert_with_metadata` with
    /// every extractor enabled and checks both the Markdown and each metadata category.
    #[test]
    fn test_convert_with_metadata_full_workflow() {
        let html = "<html lang=\"en\" dir=\"ltr\"><head><title>Test Article</title></head><body><h1 id=\"main-title\">Main Title</h1><p>This is a paragraph with a <a href=\"https://example.com\">link</a>.</p><h2>Subsection</h2><p>Another paragraph with <a href=\"#main-title\">internal link</a>.</p><img src=\"https://example.com/image.jpg\" alt=\"Test image\" title=\"Image title\"></body></html>";

        // Every extractor switched on, with the crate's default structured-data size limit.
        let config = MetadataConfig {
            extract_document: true,
            extract_headers: true,
            extract_links: true,
            extract_images: true,
            extract_structured_data: true,
            max_structured_data_size: metadata::DEFAULT_MAX_STRUCTURED_DATA_SIZE,
        };

        let (output, extracted) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        // Markdown output.
        assert!(!output.is_empty());
        assert!(output.contains("Main Title"));
        assert!(output.contains("Subsection"));

        // Document-level metadata.
        assert_eq!(extracted.document.language.as_deref(), Some("en"));

        // Headings.
        let headers = &extracted.headers;
        assert_eq!(headers.len(), 2);
        let (first, second) = (&headers[0], &headers[1]);
        assert_eq!(first.level, 1);
        assert_eq!(first.text, "Main Title");
        assert_eq!(first.id.as_deref(), Some("main-title"));
        assert_eq!(second.level, 2);
        assert_eq!(second.text, "Subsection");

        // Links: at least one external and one anchor link must be present.
        let links = &extracted.links;
        assert!(links.len() >= 2);
        assert!(links.iter().any(|link| link.link_type == LinkType::External));
        assert!(links.iter().any(|link| link.link_type == LinkType::Anchor));

        // Images.
        assert_eq!(extracted.images.len(), 1);
        let image = &extracted.images[0];
        assert_eq!(image.alt.as_deref(), Some("Test image"));
        assert_eq!(image.title.as_deref(), Some("Image title"));
        assert_eq!(image.image_type, ImageType::External);
    }

    /// Checks that title, description, author, language and Open Graph entries from `<head>`
    /// are all reported under the default configuration.
    #[test]
    fn test_convert_with_metadata_document_fields() {
        let html = "<html lang=\"en\"><head><title>Test Article</title><meta name=\"description\" content=\"Desc\"><meta name=\"author\" content=\"Author\"><meta property=\"og:title\" content=\"OG Title\"><meta property=\"og:description\" content=\"OG Desc\"></head><body><h1>Heading</h1></body></html>";

        let (_output, extracted) =
            convert_with_metadata(html, None, MetadataConfig::default()).expect("conversion should succeed");
        let doc = &extracted.document;

        assert_eq!(doc.title.as_deref(), Some("Test Article"), "document: {:?}", doc);
        assert_eq!(doc.description.as_deref(), Some("Desc"));
        assert_eq!(doc.author.as_deref(), Some("Author"));
        assert_eq!(doc.language.as_deref(), Some("en"));
        assert_eq!(doc.open_graph.get("title").map(String::as_str), Some("OG Title"));
        assert_eq!(
            doc.open_graph.get("description").map(String::as_str),
            Some("OG Desc")
        );
    }

    /// With every extractor disabled, the header, link and image lists must be empty. The test
    /// still expects the document language to be populated.
    #[test]
    fn test_convert_with_metadata_empty_config() {
        let html = "<html lang=\"en\"><head><title>Test</title></head><body><h1>Title</h1><a href=\"#\">Link</a></body></html>";

        let config = MetadataConfig {
            extract_document: false,
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: 0,
        };

        let (_output, extracted) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        assert!(extracted.headers.is_empty());
        assert!(extracted.links.is_empty());
        assert!(extracted.images.is_empty());
        assert_eq!(extracted.document.language.as_deref(), Some("en"));
    }

    /// An `<img>` whose `src` is a `data:` URI must be classified as `ImageType::DataUri`.
    #[test]
    fn test_convert_with_metadata_data_uri_image() {
        let html = "<html><body><img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==\" alt=\"Pixel\"></body></html>";

        let (_output, extracted) =
            convert_with_metadata(html, None, MetadataConfig::default()).expect("conversion should succeed");

        assert_eq!(extracted.images.len(), 1);
        let image = &extracted.images[0];
        assert_eq!(image.image_type, ImageType::DataUri);
        assert_eq!(image.alt.as_deref(), Some("Pixel"));
    }

    /// Root-relative (`/page`) and parent-relative (`../other`) hrefs must both be classified
    /// as `LinkType::Internal`.
    #[test]
    fn test_convert_with_metadata_relative_paths() {
        let html = r#"<html><body><a href="/page">Internal</a><a href="../other">Relative</a></body></html>"#;

        let (_output, extracted) =
            convert_with_metadata(html, None, MetadataConfig::default()).expect("conversion should succeed");

        let internal_count = extracted
            .links
            .iter()
            .filter(|link| link.link_type == LinkType::Internal)
            .count();
        assert_eq!(internal_count, 2);
    }
}