1use std::borrow::Cow;
11
12pub mod converter;
13pub mod error;
14pub mod hocr;
15#[cfg(feature = "inline-images")]
16mod inline_images;
17#[cfg(feature = "metadata")]
18pub mod metadata;
19pub mod options;
20pub mod safety;
21pub mod text;
22#[cfg(feature = "visitor")]
23pub mod visitor;
24#[cfg(feature = "visitor")]
25pub mod visitor_helpers;
26#[cfg(feature = "async-visitor")]
27pub use visitor_helpers::AsyncVisitorHandle;
28pub mod wrapper;
29
30pub use error::{ConversionError, Result};
31#[cfg(feature = "inline-images")]
32pub use inline_images::{
33 DEFAULT_INLINE_IMAGE_LIMIT, HtmlExtraction, InlineImage, InlineImageConfig, InlineImageConfigUpdate,
34 InlineImageFormat, InlineImageSource, InlineImageWarning,
35};
36#[cfg(feature = "metadata")]
37pub use metadata::{
38 DEFAULT_MAX_STRUCTURED_DATA_SIZE, DocumentMetadata, ExtendedMetadata, HeaderMetadata, ImageMetadata, ImageType,
39 LinkMetadata, LinkType, MetadataConfig, MetadataConfigUpdate, StructuredData, StructuredDataType, TextDirection,
40};
41pub use options::{
42 CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle, ListIndentType,
43 NewlineStyle, PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset, WhitespaceMode,
44};
45
/// Maximum number of leading input bytes that `validate_input` inspects.
const BINARY_SCAN_LIMIT: usize = 8192;
/// Fraction of control bytes in the scanned sample above which input is rejected as binary.
const BINARY_CONTROL_RATIO: f64 = 0.3;
/// Minimum fraction of NUL bytes in the scanned sample before BOM-less input is reported as
/// UTF-16 by `detect_utf16_hint`.
const BINARY_UTF16_NULL_RATIO: f64 = 0.2;

/// Leading byte sequences of binary formats that are rejected outright, each paired with the
/// human-readable label that is embedded in the resulting error message.
const BINARY_MAGIC_PREFIXES: &[(&[u8], &str)] = &[
    (b"\x1F\x8B", "gzip-compressed data"),
    (b"\x28\xB5\x2F\xFD", "zstd-compressed data"),
    (b"PK\x03\x04", "zip archive"),
    (b"PK\x05\x06", "zip archive"),
    (b"PK\x07\x08", "zip archive"),
    (b"%PDF-", "PDF data"),
];
58
59fn validate_input(html: &str) -> Result<()> {
60 let bytes = html.as_bytes();
61 if bytes.is_empty() {
62 return Ok(());
63 }
64
65 if let Some(label) = detect_binary_magic(bytes) {
66 return Err(ConversionError::InvalidInput(format!(
67 "binary data detected ({label}); decode/decompress to UTF-8 HTML first"
68 )));
69 }
70
71 let sample_len = bytes.len().min(BINARY_SCAN_LIMIT);
72 let mut control_count = 0usize;
73 let mut nul_count = 0usize;
74 let mut even_nul_count = 0usize;
75 let mut odd_nul_count = 0usize;
76
77 for (idx, &byte) in bytes[..sample_len].iter().enumerate() {
78 if byte == 0 {
79 nul_count += 1;
80 if idx % 2 == 0 {
81 even_nul_count += 1;
82 } else {
83 odd_nul_count += 1;
84 }
85 }
86 let is_control = (byte < 0x09) || (0x0E..0x20).contains(&byte);
87 if is_control {
88 control_count += 1;
89 }
90 }
91
92 if nul_count > 0 {
93 if let Some(label) = detect_utf16_hint(bytes, sample_len, nul_count, even_nul_count, odd_nul_count) {
94 return Err(ConversionError::InvalidInput(format!(
95 "binary data detected ({label}); decode to UTF-8 HTML first"
96 )));
97 }
98 return Err(ConversionError::InvalidInput("binary data detected".to_string()));
99 }
100
101 let control_ratio = control_count as f64 / sample_len as f64;
102 if control_ratio > BINARY_CONTROL_RATIO {
103 return Err(ConversionError::InvalidInput(
104 "binary data detected (excess control bytes)".to_string(),
105 ));
106 }
107
108 Ok(())
109}
110
111fn detect_binary_magic(bytes: &[u8]) -> Option<&'static str> {
112 for (prefix, label) in BINARY_MAGIC_PREFIXES {
113 if bytes.starts_with(prefix) {
114 return Some(*label);
115 }
116 }
117 None
118}
119
120fn detect_utf16_hint(
121 bytes: &[u8],
122 sample_len: usize,
123 nul_count: usize,
124 even_nul_count: usize,
125 odd_nul_count: usize,
126) -> Option<&'static str> {
127 if bytes.len() >= 2 {
128 if bytes.starts_with(b"\xFF\xFE") {
129 return Some("UTF-16LE BOM");
130 }
131 if bytes.starts_with(b"\xFE\xFF") {
132 return Some("UTF-16BE BOM");
133 }
134 }
135
136 let nul_ratio = nul_count as f64 / sample_len as f64;
137 if nul_ratio < BINARY_UTF16_NULL_RATIO {
138 return None;
139 }
140
141 let dominant_ratio = (even_nul_count.max(odd_nul_count) as f64) / nul_count as f64;
142 if dominant_ratio >= 0.9 {
143 Some("UTF-16 data without BOM")
144 } else {
145 None
146 }
147}
148
/// Converts `\r\n` pairs and lone `\r` characters to `\n`.
///
/// Input without any carriage return is handed back borrowed, so the common case does not
/// allocate.
fn normalize_line_endings(html: &str) -> Cow<'_, str> {
    if !html.contains('\r') {
        return Cow::Borrowed(html);
    }

    let mut unified = String::with_capacity(html.len());
    let mut chars = html.chars().peekable();
    while let Some(ch) = chars.next() {
        if ch != '\r' {
            unified.push(ch);
            continue;
        }
        // A CR directly followed by LF collapses into a single newline.
        if chars.peek() == Some(&'\n') {
            chars.next();
        }
        unified.push('\n');
    }
    Cow::Owned(unified)
}
156
157fn fast_text_only(html: &str, options: &ConversionOptions) -> Option<String> {
158 if html.contains('<') {
159 return None;
160 }
161
162 let mut decoded = text::decode_html_entities_cow(html);
163 if options.strip_newlines && (decoded.contains('\n') || decoded.contains('\r')) {
164 decoded = Cow::Owned(decoded.replace(&['\r', '\n'][..], " "));
165 }
166 let trimmed = decoded.trim_end_matches('\n');
167 if trimmed.is_empty() {
168 return Some(String::new());
169 }
170
171 let normalized = if options.whitespace_mode == WhitespaceMode::Normalized {
172 text::normalize_whitespace_cow(trimmed)
173 } else {
174 Cow::Borrowed(trimmed)
175 };
176
177 let escaped =
178 if options.escape_misc || options.escape_asterisks || options.escape_underscores || options.escape_ascii {
179 text::escape(
180 normalized.as_ref(),
181 options.escape_misc,
182 options.escape_asterisks,
183 options.escape_underscores,
184 options.escape_ascii,
185 )
186 } else {
187 normalized.into_owned()
188 };
189
190 let mut output = String::with_capacity(escaped.len() + 1);
191 output.push_str(&escaped);
192 while output.ends_with(' ') || output.ends_with('\t') {
193 output.pop();
194 }
195 output.push('\n');
196 Some(output)
197}
198
/// Deserializes `json` into `T`.
///
/// # Errors
///
/// Any `serde_json` failure is returned as `ConversionError::ConfigError` carrying the
/// error's display text.
#[cfg(any(feature = "serde", feature = "metadata"))]
fn parse_json<T>(json: &str) -> Result<T>
where
    T: serde::de::DeserializeOwned,
{
    match serde_json::from_str::<T>(json) {
        Ok(value) => Ok(value),
        Err(err) => Err(ConversionError::ConfigError(err.to_string())),
    }
}
203
/// Builds [`ConversionOptions`] from a JSON document.
///
/// The JSON is first parsed as a [`ConversionOptionsUpdate`] and then converted with
/// `ConversionOptions::from`.
///
/// # Errors
///
/// Returns the error produced by `parse_json` when the JSON cannot be deserialized.
#[cfg(any(feature = "serde", feature = "metadata"))]
pub fn conversion_options_from_json(json: &str) -> Result<ConversionOptions> {
    parse_json::<ConversionOptionsUpdate>(json).map(ConversionOptions::from)
}
209
/// Parses a JSON document into a [`ConversionOptionsUpdate`] without applying it.
///
/// # Errors
///
/// Returns the error produced by `parse_json` when the JSON cannot be deserialized.
#[cfg(any(feature = "serde", feature = "metadata"))]
pub fn conversion_options_update_from_json(json: &str) -> Result<ConversionOptionsUpdate> {
    let update = parse_json::<ConversionOptionsUpdate>(json)?;
    Ok(update)
}
214
/// Builds an [`InlineImageConfig`] from a JSON document.
///
/// The JSON is first parsed as an [`InlineImageConfigUpdate`] and then turned into a config
/// via `InlineImageConfig::from_update`.
///
/// # Errors
///
/// Returns the error produced by `parse_json` when the JSON cannot be deserialized.
#[cfg(all(feature = "inline-images", any(feature = "serde", feature = "metadata")))]
pub fn inline_image_config_from_json(json: &str) -> Result<InlineImageConfig> {
    parse_json::<InlineImageConfigUpdate>(json).map(InlineImageConfig::from_update)
}
220
/// Builds a [`MetadataConfig`] from a JSON document.
///
/// The JSON is first parsed as a [`MetadataConfigUpdate`] and then converted with
/// `MetadataConfig::from`.
///
/// # Errors
///
/// Returns the error produced by `parse_json` when the JSON cannot be deserialized.
#[cfg(all(feature = "metadata", any(feature = "serde", feature = "metadata")))]
pub fn metadata_config_from_json(json: &str) -> Result<MetadataConfig> {
    parse_json::<MetadataConfigUpdate>(json).map(MetadataConfig::from)
}
226
227pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<String> {
244 validate_input(html)?;
245 let options = options.unwrap_or_default();
246
247 let normalized_html = normalize_line_endings(html);
248
249 if !options.wrap {
250 if let Some(markdown) = fast_text_only(normalized_html.as_ref(), &options) {
251 return Ok(markdown);
252 }
253 }
254
255 let markdown = converter::convert_html(normalized_html.as_ref(), &options)?;
256
257 if options.wrap {
258 Ok(wrapper::wrap_markdown(&markdown, &options))
259 } else {
260 Ok(markdown)
261 }
262}
263
/// Converts HTML to Markdown while collecting inline images encountered along the way.
///
/// `options` falls back to `ConversionOptions::default()` when `None`; `image_cfg` configures
/// the image collector. The returned [`HtmlExtraction`] carries the (optionally wrapped)
/// Markdown together with the collected images and any warnings the collector produced.
///
/// # Errors
///
/// Returns `ConversionError::InvalidInput` for binary-looking input, any error from creating
/// the collector or running the converter, and `ConversionError::Other` if the collector is
/// still shared once conversion has finished.
#[cfg(feature = "inline-images")]
pub fn convert_with_inline_images(
    html: &str,
    options: Option<ConversionOptions>,
    image_cfg: InlineImageConfig,
) -> Result<HtmlExtraction> {
    use std::cell::RefCell;
    use std::rc::Rc;

    validate_input(html)?;
    let options = options.unwrap_or_default();
    let normalized_html = normalize_line_endings(html);

    let shared_collector = Rc::new(RefCell::new(inline_images::InlineImageCollector::new(image_cfg)?));

    let converted = converter::convert_html_with_inline_collector(
        normalized_html.as_ref(),
        &options,
        Rc::clone(&shared_collector),
    )?;
    let markdown = match options.wrap {
        true => wrapper::wrap_markdown(&converted, &options),
        false => converted,
    };

    // The converter must have released its handle for the collector to be taken back by value.
    let (inline_images, warnings) = match Rc::try_unwrap(shared_collector) {
        Ok(cell) => cell.into_inner().finish(),
        Err(_) => {
            return Err(ConversionError::Other(
                "failed to recover inline image state".to_string(),
            ));
        }
    };

    Ok(HtmlExtraction {
        markdown,
        inline_images,
        warnings,
    })
}
309
/// Converts HTML to Markdown and extracts document metadata in the same pass.
///
/// `options` falls back to `ConversionOptions::default()` when `None`; `metadata_cfg` selects
/// what is extracted. When `metadata_cfg.any_enabled()` is false the call is delegated to
/// [`convert`] and a default [`ExtendedMetadata`] is returned alongside its Markdown.
///
/// # Errors
///
/// Returns `ConversionError::InvalidInput` for binary-looking input, any error from the
/// converter, and `ConversionError::Other` if the metadata collector is still shared once
/// conversion has finished.
#[cfg(feature = "metadata")]
pub fn convert_with_metadata(
    html: &str,
    options: Option<ConversionOptions>,
    metadata_cfg: MetadataConfig,
) -> Result<(String, ExtendedMetadata)> {
    use std::cell::RefCell;
    use std::rc::Rc;

    // Nothing to collect: `convert` validates the input and defaults the options itself, so
    // delegating before validating here avoids scanning the input twice.
    if !metadata_cfg.any_enabled() {
        let markdown = convert(html, options)?;
        return Ok((markdown, ExtendedMetadata::default()));
    }

    validate_input(html)?;
    let options = options.unwrap_or_default();
    let normalized_html = normalize_line_endings(html);

    let metadata_collector = Rc::new(RefCell::new(metadata::MetadataCollector::new(metadata_cfg)));

    let markdown =
        converter::convert_html_with_metadata(normalized_html.as_ref(), &options, Rc::clone(&metadata_collector))?;

    let markdown = if options.wrap {
        wrapper::wrap_markdown(&markdown, &options)
    } else {
        markdown
    };

    // The converter must have released its handle for the collector to be taken back by value.
    let metadata_collector = Rc::try_unwrap(metadata_collector)
        .map_err(|_| ConversionError::Other("failed to recover metadata state".to_string()))?
        .into_inner();
    let metadata = metadata_collector.finish();

    Ok((markdown, metadata))
}
463
/// Converts HTML to Markdown, passing an optional visitor handle to the converter.
///
/// `options` falls back to `ConversionOptions::default()` when `None`. Line endings are
/// normalized before conversion, and the output is run through `wrapper::wrap_markdown` when
/// wrapping is enabled.
///
/// # Errors
///
/// Returns `ConversionError::InvalidInput` for binary-looking input, or whatever error the
/// converter reports.
#[cfg(feature = "visitor")]
pub fn convert_with_visitor(
    html: &str,
    options: Option<ConversionOptions>,
    visitor: Option<visitor::VisitorHandle>,
) -> Result<String> {
    validate_input(html)?;
    let options = options.unwrap_or_default();
    let normalized_html = normalize_line_endings(html);

    let converted = converter::convert_html_with_visitor(normalized_html.as_ref(), &options, visitor)?;

    Ok(match options.wrap {
        true => wrapper::wrap_markdown(&converted, &options),
        false => converted,
    })
}
518
/// Async entry point that converts HTML to Markdown.
///
/// `options` falls back to `ConversionOptions::default()` when `None`. The `_visitor` handle
/// is accepted but not consulted by this function: conversion goes straight through
/// `converter::convert_html`, followed by `wrapper::wrap_markdown` when wrapping is enabled.
///
/// # Errors
///
/// Returns `ConversionError::InvalidInput` for binary-looking input, or whatever error the
/// converter reports.
#[cfg(feature = "async-visitor")]
pub async fn convert_with_async_visitor(
    html: &str,
    options: Option<ConversionOptions>,
    _visitor: Option<visitor_helpers::AsyncVisitorHandle>,
) -> Result<String> {
    validate_input(html)?;
    let options = options.unwrap_or_default();
    let normalized_html = normalize_line_endings(html);

    let converted = converter::convert_html(normalized_html.as_ref(), &options)?;

    Ok(match options.wrap {
        true => wrapper::wrap_markdown(&converted, &options),
        false => converted,
    })
}
603
/// End-to-end tests for `convert_with_metadata`; compiled only with the `metadata` feature.
#[cfg(all(test, feature = "metadata"))]
mod tests {
    use super::*;

    /// Every extractor enabled: headers, links, images and document language are all reported.
    #[test]
    fn test_convert_with_metadata_full_workflow() {
        let html = "<html lang=\"en\" dir=\"ltr\"><head><title>Test Article</title></head><body><h1 id=\"main-title\">Main Title</h1><p>This is a paragraph with a <a href=\"https://example.com\">link</a>.</p><h2>Subsection</h2><p>Another paragraph with <a href=\"#main-title\">internal link</a>.</p><img src=\"https://example.com/image.jpg\" alt=\"Test image\" title=\"Image title\"></body></html>";

        let config = MetadataConfig {
            extract_document: true,
            extract_headers: true,
            extract_links: true,
            extract_images: true,
            extract_structured_data: true,
            max_structured_data_size: metadata::DEFAULT_MAX_STRUCTURED_DATA_SIZE,
        };

        let (markdown, extracted) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        assert!(!markdown.is_empty());
        for expected in ["Main Title", "Subsection"] {
            assert!(markdown.contains(expected));
        }

        assert_eq!(extracted.document.language.as_deref(), Some("en"));

        let headers = &extracted.headers;
        assert_eq!(headers.len(), 2);
        let (title, subsection) = (&headers[0], &headers[1]);
        assert_eq!(title.level, 1);
        assert_eq!(title.text, "Main Title");
        assert_eq!(title.id.as_deref(), Some("main-title"));
        assert_eq!(subsection.level, 2);
        assert_eq!(subsection.text, "Subsection");

        let links = &extracted.links;
        assert!(links.len() >= 2);
        assert!(links.iter().any(|link| link.link_type == LinkType::External));
        assert!(links.iter().any(|link| link.link_type == LinkType::Anchor));

        assert_eq!(extracted.images.len(), 1);
        let image = &extracted.images[0];
        assert_eq!(image.alt.as_deref(), Some("Test image"));
        assert_eq!(image.title.as_deref(), Some("Image title"));
        assert_eq!(image.image_type, ImageType::External);
    }

    /// Default config: title, description, author, language and Open Graph tags are captured.
    #[test]
    fn test_convert_with_metadata_document_fields() {
        let html = "<html lang=\"en\"><head><title>Test Article</title><meta name=\"description\" content=\"Desc\"><meta name=\"author\" content=\"Author\"><meta property=\"og:title\" content=\"OG Title\"><meta property=\"og:description\" content=\"OG Desc\"></head><body><h1>Heading</h1></body></html>";

        let (_markdown, extracted) =
            convert_with_metadata(html, None, MetadataConfig::default()).expect("conversion should succeed");
        let document = &extracted.document;

        assert_eq!(
            document.title.as_deref(),
            Some("Test Article"),
            "document: {:?}",
            document
        );
        assert_eq!(document.description.as_deref(), Some("Desc"));
        assert_eq!(document.author.as_deref(), Some("Author"));
        assert_eq!(document.language.as_deref(), Some("en"));
        assert_eq!(
            document.open_graph.get("title").map(String::as_str),
            Some("OG Title")
        );
        assert_eq!(
            document.open_graph.get("description").map(String::as_str),
            Some("OG Desc")
        );
    }

    /// Every extractor disabled: the returned metadata stays empty.
    #[test]
    fn test_convert_with_metadata_empty_config() {
        let html = "<html lang=\"en\"><head><title>Test</title></head><body><h1>Title</h1><a href=\"#\">Link</a></body></html>";

        let config = MetadataConfig {
            extract_document: false,
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: 0,
        };

        let (_markdown, extracted) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        assert!(extracted.headers.is_empty());
        assert!(extracted.links.is_empty());
        assert!(extracted.images.is_empty());
        assert!(extracted.document.language.is_none());
    }

    /// A `data:` image source is classified as `ImageType::DataUri`.
    #[test]
    fn test_convert_with_metadata_data_uri_image() {
        let html = "<html><body><img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==\" alt=\"Pixel\"></body></html>";

        let (_markdown, extracted) =
            convert_with_metadata(html, None, MetadataConfig::default()).expect("conversion should succeed");

        assert_eq!(extracted.images.len(), 1);
        let image = &extracted.images[0];
        assert_eq!(image.image_type, ImageType::DataUri);
        assert_eq!(image.alt.as_deref(), Some("Pixel"));
    }

    /// Root-relative and parent-relative hrefs both count as internal links.
    #[test]
    fn test_convert_with_metadata_relative_paths() {
        let html = r#"<html><body><a href="/page">Internal</a><a href="../other">Relative</a></body></html>"#;

        let (_markdown, extracted) =
            convert_with_metadata(html, None, MetadataConfig::default()).expect("conversion should succeed");

        let internal_count = extracted
            .links
            .iter()
            .filter(|link| link.link_type == LinkType::Internal)
            .count();
        assert_eq!(internal_count, 2);
    }
}
721
/// Tests for input validation and the plain-text path of `convert`.
#[cfg(test)]
mod basic_tests {
    use super::*;

    /// Asserts that converting `input` with default options fails with `InvalidInput`.
    fn assert_rejected(input: &str) {
        let outcome = convert(input, None);
        assert!(matches!(outcome, Err(ConversionError::InvalidInput(_))));
    }

    /// Input containing a NUL byte is rejected.
    #[test]
    fn test_binary_input_rejected() {
        assert_rejected("PDF\0DATA");
    }

    /// Input starting with the gzip magic bytes is rejected.
    #[test]
    fn test_binary_magic_rejected() {
        let lossy = String::from_utf8_lossy(b"\x1F\x8B\x08\x00gzip").into_owned();
        assert_rejected(&lossy);
    }

    /// UTF-16-looking bytes, lossily decoded to a `String`, are rejected.
    #[test]
    fn test_utf16_hint_rejected() {
        let lossy = String::from_utf8_lossy(b"\xFF\xFE<\0h\0t\0m\0l\0>\0").into_owned();
        assert_rejected(&lossy);
    }

    /// Markup-free text converts successfully and keeps its content.
    #[test]
    fn test_plain_text_allowed() {
        let markdown = convert("Just text", None).unwrap();
        assert!(markdown.contains("Just text"));
    }

    /// Escaping options are honoured for markup-free text as well.
    #[test]
    fn test_plain_text_escaped_when_enabled() {
        let options = ConversionOptions {
            escape_asterisks: true,
            escape_underscores: true,
            ..ConversionOptions::default()
        };
        let markdown = convert("Text *asterisks* _underscores_", Some(options)).unwrap();
        for escaped in [r"\*asterisks\*", r"\_underscores\_"] {
            assert!(markdown.contains(escaped));
        }
    }
}