1use std::borrow::Cow;
11
12pub mod converter;
13pub mod error;
14pub mod hocr;
15#[cfg(feature = "inline-images")]
16mod inline_images;
17#[cfg(feature = "metadata")]
18pub mod metadata;
19pub mod options;
20pub mod safety;
21pub mod text;
22pub mod wrapper;
23
24pub use error::{ConversionError, Result};
25#[cfg(feature = "inline-images")]
26pub use inline_images::{
27 HtmlExtraction, InlineImage, InlineImageConfig, InlineImageFormat, InlineImageSource, InlineImageWarning,
28};
29#[cfg(feature = "metadata")]
30pub use metadata::{
31 DEFAULT_MAX_STRUCTURED_DATA_SIZE, DocumentMetadata, ExtendedMetadata, HeaderMetadata, ImageMetadata, ImageType,
32 LinkMetadata, LinkType, MetadataConfig, StructuredData, StructuredDataType, TextDirection,
33};
34pub use options::{
35 CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
36 PreprocessingOptions, PreprocessingPreset, WhitespaceMode,
37};
38
39pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<String> {
56 let options = options.unwrap_or_default();
57
58 let normalized_html = if html.contains('\r') {
59 Cow::Owned(html.replace("\r\n", "\n").replace('\r', "\n"))
60 } else {
61 Cow::Borrowed(html)
62 };
63
64 let markdown = converter::convert_html(normalized_html.as_ref(), &options)?;
65
66 if options.wrap {
67 Ok(wrapper::wrap_markdown(&markdown, &options))
68 } else {
69 Ok(markdown)
70 }
71}
72
/// Converts HTML to Markdown while collecting inline images found on the way.
///
/// # Arguments
///
/// * `html` - The HTML source text.
/// * `options` - Conversion settings; `None` falls back to
///   `ConversionOptions::default()`.
/// * `image_cfg` - Configuration handed to the inline image collector.
///
/// Line endings are normalized exactly as in `convert` (`\r\n` and lone `\r`
/// become `\n`), and the Markdown is wrapped via `wrapper::wrap_markdown` when
/// `options.wrap` is set.
///
/// # Returns
///
/// An `HtmlExtraction` holding the Markdown plus the images and warnings
/// returned by the collector's `finish()`.
///
/// # Errors
///
/// Fails if the collector cannot be constructed from `image_cfg`, if the
/// conversion itself fails, or if the shared collector handle is still
/// referenced elsewhere after conversion and therefore cannot be reclaimed.
#[cfg(feature = "inline-images")]
pub fn convert_with_inline_images(
    html: &str,
    options: Option<ConversionOptions>,
    image_cfg: InlineImageConfig,
) -> Result<HtmlExtraction> {
    use std::cell::RefCell;
    use std::rc::Rc;

    let opts = options.unwrap_or_default();

    // Borrow the input untouched unless a carriage return forces a rewrite.
    let source: Cow<'_, str> = if !html.contains('\r') {
        Cow::Borrowed(html)
    } else {
        Cow::Owned(html.replace("\r\n", "\n").replace('\r', "\n"))
    };

    // The converter receives a clone of this handle; we keep the original so
    // the collected state can be reclaimed once conversion is done.
    let image_state = inline_images::InlineImageCollector::new(image_cfg)?;
    let shared = Rc::new(RefCell::new(image_state));

    let mut markdown =
        converter::convert_html_with_inline_collector(&*source, &opts, Rc::clone(&shared))?;
    if opts.wrap {
        markdown = wrapper::wrap_markdown(&markdown, &opts);
    }

    // `try_unwrap` only succeeds when ours is the last remaining handle.
    let cell = Rc::try_unwrap(shared)
        .map_err(|_| ConversionError::Other("failed to recover inline image state".to_string()))?;
    let (inline_images, warnings) = cell.into_inner().finish();

    Ok(HtmlExtraction {
        markdown,
        inline_images,
        warnings,
    })
}
121
/// Converts HTML to Markdown and gathers document metadata in the same pass.
///
/// # Arguments
///
/// * `html` - The HTML source text.
/// * `options` - Conversion settings; `None` falls back to
///   `ConversionOptions::default()`.
/// * `metadata_cfg` - Configuration handed to the metadata collector.
///
/// Line endings are normalized exactly as in `convert` (`\r\n` and lone `\r`
/// become `\n`), and the Markdown is wrapped via `wrapper::wrap_markdown` when
/// `options.wrap` is set.
///
/// # Returns
///
/// A `(markdown, metadata)` pair, where `metadata` is the value returned by
/// the collector's `finish()`.
///
/// # Errors
///
/// Fails if the conversion itself fails, or if the shared collector handle is
/// still referenced elsewhere after conversion and therefore cannot be
/// reclaimed.
#[cfg(feature = "metadata")]
pub fn convert_with_metadata(
    html: &str,
    options: Option<ConversionOptions>,
    metadata_cfg: MetadataConfig,
) -> Result<(String, ExtendedMetadata)> {
    use std::cell::RefCell;
    use std::rc::Rc;

    let opts = options.unwrap_or_default();

    // Borrow the input untouched unless a carriage return forces a rewrite.
    let source: Cow<'_, str> = if !html.contains('\r') {
        Cow::Borrowed(html)
    } else {
        Cow::Owned(html.replace("\r\n", "\n").replace('\r', "\n"))
    };

    // The converter receives a clone of this handle; we keep the original so
    // the collected state can be reclaimed once conversion is done.
    let shared = Rc::new(RefCell::new(metadata::MetadataCollector::new(metadata_cfg)));

    let mut markdown = converter::convert_html_with_metadata(&*source, &opts, Rc::clone(&shared))?;
    if opts.wrap {
        markdown = wrapper::wrap_markdown(&markdown, &opts);
    }

    // `try_unwrap` only succeeds when ours is the last remaining handle.
    let cell = Rc::try_unwrap(shared)
        .map_err(|_| ConversionError::Other("failed to recover metadata state".to_string()))?;
    let metadata = cell.into_inner().finish();

    Ok((markdown, metadata))
}
274
/// End-to-end tests for `convert_with_metadata`; compiled only for test builds
/// with the `metadata` feature enabled.
#[cfg(all(test, feature = "metadata"))]
mod tests {
    use super::*;

    /// Enables every extractor and checks the Markdown plus the document,
    /// header, link and image metadata from a single small page.
    #[test]
    fn test_convert_with_metadata_full_workflow() {
        let html = "<html lang=\"en\" dir=\"ltr\"><head><title>Test Article</title></head><body><h1 id=\"main-title\">Main Title</h1><p>This is a paragraph with a <a href=\"https://example.com\">link</a>.</p><h2>Subsection</h2><p>Another paragraph with <a href=\"#main-title\">internal link</a>.</p><img src=\"https://example.com/image.jpg\" alt=\"Test image\" title=\"Image title\"></body></html>";

        let config = MetadataConfig {
            extract_document: true,
            extract_headers: true,
            extract_links: true,
            extract_images: true,
            extract_structured_data: true,
            max_structured_data_size: metadata::DEFAULT_MAX_STRUCTURED_DATA_SIZE,
        };

        let (markdown, metadata) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        assert!(!markdown.is_empty());
        assert!(markdown.contains("Main Title"));
        assert!(markdown.contains("Subsection"));

        assert_eq!(metadata.document.language, Some("en".to_string()));

        assert_eq!(metadata.headers.len(), 2);
        assert_eq!(metadata.headers[0].level, 1);
        assert_eq!(metadata.headers[0].text, "Main Title");
        assert_eq!(metadata.headers[0].id, Some("main-title".to_string()));
        assert_eq!(metadata.headers[1].level, 2);
        assert_eq!(metadata.headers[1].text, "Subsection");

        // The page has one absolute URL and one `#fragment` link.
        assert!(metadata.links.len() >= 2);
        let external_link = metadata.links.iter().find(|l| l.link_type == LinkType::External);
        assert!(external_link.is_some());
        let anchor_link = metadata.links.iter().find(|l| l.link_type == LinkType::Anchor);
        assert!(anchor_link.is_some());

        assert_eq!(metadata.images.len(), 1);
        assert_eq!(metadata.images[0].alt, Some("Test image".to_string()));
        assert_eq!(metadata.images[0].title, Some("Image title".to_string()));
        assert_eq!(metadata.images[0].image_type, ImageType::External);
    }

    /// Checks that `<title>`, `<meta name=...>` and `og:*` properties end up
    /// in the document metadata under the default configuration.
    #[test]
    fn test_convert_with_metadata_document_fields() {
        let html = "<html lang=\"en\"><head><title>Test Article</title><meta name=\"description\" content=\"Desc\"><meta name=\"author\" content=\"Author\"><meta property=\"og:title\" content=\"OG Title\"><meta property=\"og:description\" content=\"OG Desc\"></head><body><h1>Heading</h1></body></html>";

        let (_markdown, metadata) =
            convert_with_metadata(html, None, MetadataConfig::default()).expect("conversion should succeed");

        assert_eq!(
            metadata.document.title,
            Some("Test Article".to_string()),
            "document: {:?}",
            metadata.document
        );
        assert_eq!(metadata.document.description, Some("Desc".to_string()));
        assert_eq!(metadata.document.author, Some("Author".to_string()));
        assert_eq!(metadata.document.language, Some("en".to_string()));
        // Open Graph entries are looked up without their `og:` prefix.
        assert_eq!(metadata.document.open_graph.get("title"), Some(&"OG Title".to_string()));
        assert_eq!(
            metadata.document.open_graph.get("description"),
            Some(&"OG Desc".to_string())
        );
    }

    /// Disables every extractor and checks that the header, link and image
    /// collections stay empty.
    #[test]
    fn test_convert_with_metadata_empty_config() {
        let html = "<html lang=\"en\"><head><title>Test</title></head><body><h1>Title</h1><a href=\"#\">Link</a></body></html>";

        let config = MetadataConfig {
            extract_document: false,
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: 0,
        };

        let (_markdown, metadata) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        assert!(metadata.headers.is_empty());
        assert!(metadata.links.is_empty());
        assert!(metadata.images.is_empty());
        // NOTE(review): language is still expected although `extract_document`
        // is false — confirm this is the intended collector behavior.
        assert_eq!(metadata.document.language, Some("en".to_string()));
    }

    /// Checks that an image embedded through a `data:` URI is reported as
    /// `ImageType::DataUri`.
    #[test]
    fn test_convert_with_metadata_data_uri_image() {
        // The fixture must carry a real `data:` URI: with an empty `src` the
        // test would not exercise the data-URI path it asserts on.
        // Payload is a minimal base64-encoded PNG.
        let html = "<html><body><img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==\" alt=\"Pixel\"></body></html>";

        let config = MetadataConfig::default();

        let (_markdown, metadata) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        assert_eq!(metadata.images.len(), 1);
        assert_eq!(metadata.images[0].image_type, ImageType::DataUri);
        assert_eq!(metadata.images[0].alt, Some("Pixel".to_string()));
    }

    /// Checks that root-relative (`/page`) and parent-relative (`../other`)
    /// hrefs are both classified as `LinkType::Internal`.
    #[test]
    fn test_convert_with_metadata_relative_paths() {
        let html = r#"<html><body><a href="/page">Internal</a><a href="../other">Relative</a></body></html>"#;

        let config = MetadataConfig::default();

        let (_markdown, metadata) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        let internal_links: Vec<_> = metadata
            .links
            .iter()
            .filter(|l| l.link_type == LinkType::Internal)
            .collect();
        assert_eq!(internal_links.len(), 2);
    }
}