1#![allow(
2 clippy::too_many_lines,
3 clippy::option_if_let_else,
4 clippy::match_wildcard_for_single_variants,
5 clippy::needless_pass_by_value,
6 clippy::struct_excessive_bools,
7 clippy::fn_params_excessive_bools,
8 clippy::branches_sharing_code,
9 clippy::match_same_arms,
10 clippy::missing_errors_doc,
11 clippy::items_after_statements,
12 clippy::doc_markdown,
13 clippy::cast_sign_loss,
14 clippy::default_trait_access,
15 clippy::unused_self,
16 clippy::cast_precision_loss,
17 clippy::collapsible_if,
18 clippy::too_many_arguments,
19 clippy::collapsible_else_if,
20 clippy::extra_unused_lifetimes,
21 clippy::unnecessary_lazy_evaluations,
22 clippy::must_use_candidate,
23 clippy::trivially_copy_pass_by_ref,
24 clippy::explicit_iter_loop,
25 clippy::missing_const_for_fn,
26 clippy::manual_assert,
27 clippy::return_self_not_must_use,
28 clippy::collapsible_match,
29 clippy::cast_possible_truncation,
30 clippy::map_unwrap_or,
31 clippy::manual_let_else,
32 clippy::used_underscore_binding,
33 clippy::assigning_clones,
34 clippy::uninlined_format_args
35)]
36#![allow(dead_code)]
37
38use std::borrow::Cow;
47
48pub mod converter;
49pub mod error;
50pub mod hocr;
51#[cfg(feature = "inline-images")]
52mod inline_images;
53#[cfg(feature = "metadata")]
54pub mod metadata;
55pub mod options;
56pub mod safety;
57pub mod text;
58#[cfg(feature = "visitor")]
59pub mod visitor;
60#[cfg(feature = "visitor")]
61pub mod visitor_helpers;
62#[cfg(feature = "async-visitor")]
63pub use visitor_helpers::AsyncVisitorHandle;
64pub mod wrapper;
65
66pub use error::{ConversionError, Result};
67#[cfg(feature = "inline-images")]
68pub use inline_images::{
69 DEFAULT_INLINE_IMAGE_LIMIT, HtmlExtraction, InlineImage, InlineImageConfig, InlineImageConfigUpdate,
70 InlineImageFormat, InlineImageSource, InlineImageWarning,
71};
72#[cfg(feature = "metadata")]
73pub use metadata::{
74 DEFAULT_MAX_STRUCTURED_DATA_SIZE, DocumentMetadata, ExtendedMetadata, HeaderMetadata, ImageMetadata, ImageType,
75 LinkMetadata, LinkType, MetadataConfig, MetadataConfigUpdate, StructuredData, StructuredDataType, TextDirection,
76};
77pub use options::{
78 CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle, ListIndentType,
79 NewlineStyle, PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset, WhitespaceMode,
80};
81
/// Upper bound on how many leading bytes of the input are sampled when scanning for
/// binary content.
const BINARY_SCAN_LIMIT: usize = 8 * 1024;
/// Fraction of control bytes in the sample above which the input is rejected as binary.
const BINARY_CONTROL_RATIO: f64 = 0.3;
/// Minimum fraction of NUL bytes in the sample before BOM-less UTF-16 is suspected.
const BINARY_UTF16_NULL_RATIO: f64 = 0.2;

/// Leading byte signatures of known binary formats, each paired with the label that is
/// embedded in the resulting error message.
const BINARY_MAGIC_PREFIXES: &[(&[u8], &str)] = &[
    // gzip
    (b"\x1F\x8B", "gzip-compressed data"),
    // zstd frame
    (b"\x28\xB5\x2F\xFD", "zstd-compressed data"),
    // zip: three signature variants sharing the `PK` lead-in
    (b"PK\x03\x04", "zip archive"),
    (b"PK\x05\x06", "zip archive"),
    (b"PK\x07\x08", "zip archive"),
    // PDF header
    (b"%PDF-", "PDF data"),
];
94
95#[allow(clippy::cast_precision_loss)]
96fn validate_input(html: &str) -> Result<()> {
97 let bytes = html.as_bytes();
98 if bytes.is_empty() {
99 return Ok(());
100 }
101
102 if let Some(label) = detect_binary_magic(bytes) {
103 return Err(ConversionError::InvalidInput(format!(
104 "binary data detected ({label}); decode/decompress to UTF-8 HTML first"
105 )));
106 }
107
108 let sample_len = bytes.len().min(BINARY_SCAN_LIMIT);
109 let mut control_count = 0usize;
110 let mut nul_count = 0usize;
111 let mut even_nul_count = 0usize;
112 let mut odd_nul_count = 0usize;
113
114 for (idx, &byte) in bytes[..sample_len].iter().enumerate() {
115 if byte == 0 {
116 nul_count += 1;
117 if idx % 2 == 0 {
118 even_nul_count += 1;
119 } else {
120 odd_nul_count += 1;
121 }
122 }
123 let is_control = (byte < 0x09) || (0x0E..0x20).contains(&byte);
124 if is_control {
125 control_count += 1;
126 }
127 }
128
129 if nul_count > 0 {
130 if let Some(label) = detect_utf16_hint(bytes, sample_len, nul_count, even_nul_count, odd_nul_count) {
131 return Err(ConversionError::InvalidInput(format!(
132 "binary data detected ({label}); decode to UTF-8 HTML first"
133 )));
134 }
135 return Err(ConversionError::InvalidInput("binary data detected".to_string()));
136 }
137
138 let control_ratio = control_count as f64 / sample_len as f64;
139 if control_ratio > BINARY_CONTROL_RATIO {
140 return Err(ConversionError::InvalidInput(
141 "binary data detected (excess control bytes)".to_string(),
142 ));
143 }
144
145 Ok(())
146}
147
148fn detect_binary_magic(bytes: &[u8]) -> Option<&'static str> {
149 for (prefix, label) in BINARY_MAGIC_PREFIXES {
150 if bytes.starts_with(prefix) {
151 return Some(*label);
152 }
153 }
154 None
155}
156
157#[allow(clippy::cast_precision_loss)]
158fn detect_utf16_hint(
159 bytes: &[u8],
160 sample_len: usize,
161 nul_count: usize,
162 even_nul_count: usize,
163 odd_nul_count: usize,
164) -> Option<&'static str> {
165 if bytes.len() >= 2 {
166 if bytes.starts_with(b"\xFF\xFE") {
167 return Some("UTF-16LE BOM");
168 }
169 if bytes.starts_with(b"\xFE\xFF") {
170 return Some("UTF-16BE BOM");
171 }
172 }
173
174 #[allow(clippy::cast_precision_loss)]
175 let nul_ratio = nul_count as f64 / sample_len as f64;
176 if nul_ratio < BINARY_UTF16_NULL_RATIO {
177 return None;
178 }
179
180 #[allow(clippy::cast_precision_loss)]
181 let dominant_ratio = (even_nul_count.max(odd_nul_count) as f64) / nul_count as f64;
182 if dominant_ratio >= 0.9 {
183 Some("UTF-16 data without BOM")
184 } else {
185 None
186 }
187}
188
/// Converts every `\r\n` pair and every remaining lone `\r` in `html` to `\n`.
///
/// Input without any `\r` is returned borrowed, so no allocation happens in that case.
fn normalize_line_endings(html: &str) -> Cow<'_, str> {
    if !html.contains('\r') {
        return Cow::Borrowed(html);
    }

    let mut normalized = String::with_capacity(html.len());
    let mut chars = html.chars().peekable();
    while let Some(ch) = chars.next() {
        if ch != '\r' {
            normalized.push(ch);
            continue;
        }
        // A `\r` followed by `\n` collapses into a single newline.
        if chars.peek() == Some(&'\n') {
            chars.next();
        }
        normalized.push('\n');
    }
    Cow::Owned(normalized)
}
196
197fn fast_text_only(html: &str, options: &ConversionOptions) -> Option<String> {
198 if html.contains('<') {
199 return None;
200 }
201
202 let mut decoded = text::decode_html_entities_cow(html);
203 if options.strip_newlines && (decoded.contains('\n') || decoded.contains('\r')) {
204 decoded = Cow::Owned(decoded.replace(&['\r', '\n'][..], " "));
205 }
206 let trimmed = decoded.trim_end_matches('\n');
207 if trimmed.is_empty() {
208 return Some(String::new());
209 }
210
211 let normalized = if options.whitespace_mode == WhitespaceMode::Normalized {
212 text::normalize_whitespace_cow(trimmed)
213 } else {
214 Cow::Borrowed(trimmed)
215 };
216
217 let escaped =
218 if options.escape_misc || options.escape_asterisks || options.escape_underscores || options.escape_ascii {
219 text::escape(
220 normalized.as_ref(),
221 options.escape_misc,
222 options.escape_asterisks,
223 options.escape_underscores,
224 options.escape_ascii,
225 )
226 } else {
227 normalized.into_owned()
228 };
229
230 let mut output = String::with_capacity(escaped.len() + 1);
231 output.push_str(&escaped);
232 while output.ends_with(' ') || output.ends_with('\t') {
233 output.pop();
234 }
235 output.push('\n');
236 Some(output)
237}
238
/// Deserializes `json` into `T`.
///
/// # Errors
///
/// Returns [`ConversionError::ConfigError`] carrying the `serde_json` error text when
/// deserialization fails.
#[cfg(any(feature = "serde", feature = "metadata"))]
fn parse_json<T>(json: &str) -> Result<T>
where
    T: serde::de::DeserializeOwned,
{
    match serde_json::from_str::<T>(json) {
        Ok(value) => Ok(value),
        Err(err) => Err(ConversionError::ConfigError(err.to_string())),
    }
}
243
/// Builds [`ConversionOptions`] from a JSON document.
///
/// The JSON is deserialized as a [`ConversionOptionsUpdate`] and then converted into
/// [`ConversionOptions`] through its `From` implementation.
///
/// # Errors
///
/// Returns the error produced by `parse_json` when `json` cannot be deserialized.
#[cfg(any(feature = "serde", feature = "metadata"))]
pub fn conversion_options_from_json(json: &str) -> Result<ConversionOptions> {
    parse_json::<ConversionOptionsUpdate>(json).map(ConversionOptions::from)
}
265
/// Deserializes a [`ConversionOptionsUpdate`] from a JSON document.
///
/// # Errors
///
/// Returns the error produced by `parse_json` when `json` cannot be deserialized.
#[cfg(any(feature = "serde", feature = "metadata"))]
pub fn conversion_options_update_from_json(json: &str) -> Result<ConversionOptionsUpdate> {
    parse_json::<ConversionOptionsUpdate>(json)
}
286
/// Builds an [`InlineImageConfig`] from a JSON document.
///
/// The JSON is deserialized as an [`InlineImageConfigUpdate`] and handed to
/// `InlineImageConfig::from_update`.
///
/// # Errors
///
/// Returns the error produced by `parse_json` when `json` cannot be deserialized.
#[cfg(all(feature = "inline-images", any(feature = "serde", feature = "metadata")))]
pub fn inline_image_config_from_json(json: &str) -> Result<InlineImageConfig> {
    parse_json::<InlineImageConfigUpdate>(json).map(InlineImageConfig::from_update)
}
308
/// Builds a [`MetadataConfig`] from a JSON document.
///
/// The JSON is deserialized as a [`MetadataConfigUpdate`] and then converted into
/// [`MetadataConfig`] through its `From` implementation.
///
/// # Errors
///
/// Returns the error produced by `parse_json` when `json` cannot be deserialized.
// The previous predicate `all(metadata, any(serde, metadata))` is logically identical to
// `metadata` alone, so the redundant `any(..)` clause is dropped.
#[cfg(feature = "metadata")]
pub fn metadata_config_from_json(json: &str) -> Result<MetadataConfig> {
    let update: MetadataConfigUpdate = parse_json(json)?;
    Ok(MetadataConfig::from(update))
}
330
331pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<String> {
351 validate_input(html)?;
352 let options = options.unwrap_or_default();
353
354 let normalized_html = normalize_line_endings(html);
355
356 if !options.wrap {
357 if let Some(markdown) = fast_text_only(normalized_html.as_ref(), &options) {
358 return Ok(markdown);
359 }
360 }
361
362 let markdown = converter::convert_html(normalized_html.as_ref(), &options)?;
363
364 if options.wrap {
365 Ok(wrapper::wrap_markdown(&markdown, &options))
366 } else {
367 Ok(markdown)
368 }
369}
370
/// Converts `html` to Markdown while collecting inline images.
///
/// Uses default [`ConversionOptions`] when `options` is `None`. An inline image collector
/// built from `image_cfg` is shared with the converter for the duration of the
/// conversion; afterwards its images and warnings are returned together with the
/// (optionally wrapped) Markdown in an [`HtmlExtraction`]. With the `visitor` feature the
/// given visitor is forwarded to the converter; without it the placeholder argument is
/// ignored and `None` is forwarded.
///
/// # Errors
///
/// Returns an error when input validation rejects `html`, when the collector cannot be
/// created from `image_cfg`, when the converter fails, or when the collector is still
/// shared after conversion (`"failed to recover inline image state"`).
#[cfg(feature = "inline-images")]
pub fn convert_with_inline_images(
    html: &str,
    options: Option<ConversionOptions>,
    image_cfg: InlineImageConfig,
    #[cfg(feature = "visitor")] visitor: Option<visitor::VisitorHandle>,
    #[cfg(not(feature = "visitor"))] _visitor: Option<()>,
) -> Result<HtmlExtraction> {
    use std::cell::RefCell;
    use std::rc::Rc;

    validate_input(html)?;
    let options = options.unwrap_or_default();
    let normalized_html = normalize_line_endings(html);

    let shared_collector = Rc::new(RefCell::new(inline_images::InlineImageCollector::new(image_cfg)?));

    // Pick the visitor argument once so the converter call is written a single time.
    #[cfg(feature = "visitor")]
    let visitor_arg = visitor;
    #[cfg(not(feature = "visitor"))]
    let visitor_arg = None;

    let raw_markdown = converter::convert_html_impl(
        &*normalized_html,
        &options,
        Some(Rc::clone(&shared_collector)),
        None,
        visitor_arg,
    )?;

    let markdown = if options.wrap {
        wrapper::wrap_markdown(&raw_markdown, &options)
    } else {
        raw_markdown
    };

    // Recovering the collector requires that the converter released its clone.
    let collector = match Rc::try_unwrap(shared_collector) {
        Ok(cell) => cell.into_inner(),
        Err(_) => {
            return Err(ConversionError::Other(
                "failed to recover inline image state".to_string(),
            ));
        }
    };
    let (inline_images, warnings) = collector.finish();

    Ok(HtmlExtraction {
        markdown,
        inline_images,
        warnings,
    })
}
436
/// Converts `html` to Markdown and returns it together with extracted metadata.
///
/// Uses default [`ConversionOptions`] when `options` is `None`. When
/// `metadata_cfg.any_enabled()` is false no metadata collector is created and
/// `ExtendedMetadata::default()` is returned next to the Markdown. Otherwise a collector
/// built from `metadata_cfg` is shared with the converter and its result is returned.
/// With the `visitor` feature the given visitor is forwarded to the converter; without
/// it the placeholder argument is ignored and `None` is forwarded. The Markdown is
/// wrapped when `options.wrap` is set.
///
/// # Errors
///
/// Returns an error when input validation rejects `html`, when the converter fails, or
/// when the collector is still shared after conversion
/// (`"failed to recover metadata state"`).
#[cfg(feature = "metadata")]
pub fn convert_with_metadata(
    html: &str,
    options: Option<ConversionOptions>,
    metadata_cfg: MetadataConfig,
    #[cfg(feature = "visitor")] visitor: Option<visitor::VisitorHandle>,
    #[cfg(not(feature = "visitor"))] _visitor: Option<()>,
) -> Result<(String, ExtendedMetadata)> {
    use std::cell::RefCell;
    use std::rc::Rc;

    validate_input(html)?;
    let options = options.unwrap_or_default();
    let normalized_html = normalize_line_endings(html);

    // A collector only exists when at least one kind of extraction is enabled.
    let shared_collector = if metadata_cfg.any_enabled() {
        Some(Rc::new(RefCell::new(metadata::MetadataCollector::new(metadata_cfg))))
    } else {
        None
    };

    // Pick the visitor argument once so the converter call is written a single time.
    #[cfg(feature = "visitor")]
    let visitor_arg = visitor;
    #[cfg(not(feature = "visitor"))]
    let visitor_arg = None;

    let raw_markdown = converter::convert_html_impl(
        &*normalized_html,
        &options,
        None,
        shared_collector.as_ref().map(Rc::clone),
        visitor_arg,
    )?;

    let markdown = if options.wrap {
        wrapper::wrap_markdown(&raw_markdown, &options)
    } else {
        raw_markdown
    };

    let metadata = match shared_collector {
        // Recovering the collector requires that the converter released its clone.
        Some(shared) => Rc::try_unwrap(shared)
            .map_err(|_| ConversionError::Other("failed to recover metadata state".to_string()))?
            .into_inner()
            .finish(),
        None => ExtendedMetadata::default(),
    };

    Ok((markdown, metadata))
}
616
/// Converts `html` to Markdown, forwarding an optional visitor to the converter.
///
/// Uses default [`ConversionOptions`] when `options` is `None`. The input is validated
/// and its line endings are normalized before `converter::convert_html_with_visitor`
/// runs; the result is wrapped when `options.wrap` is set.
///
/// # Errors
///
/// Returns an error when input validation rejects `html` or the converter fails.
#[cfg(feature = "visitor")]
pub fn convert_with_visitor(
    html: &str,
    options: Option<ConversionOptions>,
    visitor: Option<visitor::VisitorHandle>,
) -> Result<String> {
    validate_input(html)?;
    let options = options.unwrap_or_default();
    let normalized_html = normalize_line_endings(html);

    let markdown = converter::convert_html_with_visitor(&*normalized_html, &options, visitor)?;
    if !options.wrap {
        return Ok(markdown);
    }
    Ok(wrapper::wrap_markdown(&markdown, &options))
}
674
/// Async entry point that converts `html` to Markdown.
///
/// Uses default [`ConversionOptions`] when `options` is `None`. The input is validated
/// and its line endings are normalized before `converter::convert_html` runs; the result
/// is wrapped when `options.wrap` is set. The body contains no `.await`.
///
/// NOTE(review): `_visitor` is accepted but never used in this body, so the conversion
/// runs without any visitor. Confirm whether the async visitor is meant to be wired in
/// here or is handled elsewhere.
///
/// # Errors
///
/// Returns an error when input validation rejects `html` or the converter fails.
#[cfg(feature = "async-visitor")]
#[allow(clippy::future_not_send, clippy::unused_async)]
pub async fn convert_with_async_visitor(
    html: &str,
    options: Option<ConversionOptions>,
    _visitor: Option<visitor_helpers::AsyncVisitorHandle>,
) -> Result<String> {
    validate_input(html)?;
    let options = options.unwrap_or_default();
    let normalized_html = normalize_line_endings(html);

    let markdown = converter::convert_html(&*normalized_html, &options)?;
    if !options.wrap {
        return Ok(markdown);
    }
    Ok(wrapper::wrap_markdown(&markdown, &options))
}
764
/// Tests for `convert_with_metadata`; compiled only with the `metadata` feature.
#[cfg(all(test, feature = "metadata"))]
mod tests {
    use super::*;

    /// Every extraction kind enabled: checks Markdown content plus document, header,
    /// link and image metadata for one document.
    #[test]
    fn test_convert_with_metadata_full_workflow() {
        let html = "<html lang=\"en\" dir=\"ltr\"><head><title>Test Article</title></head><body><h1 id=\"main-title\">Main Title</h1><p>This is a paragraph with a <a href=\"https://example.com\">link</a>.</p><h2>Subsection</h2><p>Another paragraph with <a href=\"#main-title\">internal link</a>.</p><img src=\"https://example.com/image.jpg\" alt=\"Test image\" title=\"Image title\"></body></html>";

        let config = MetadataConfig {
            extract_document: true,
            extract_headers: true,
            extract_links: true,
            extract_images: true,
            extract_structured_data: true,
            max_structured_data_size: metadata::DEFAULT_MAX_STRUCTURED_DATA_SIZE,
        };

        let (markdown, metadata) = convert_with_metadata(html, None, config, None).expect("conversion should succeed");

        assert!(!markdown.is_empty());
        assert!(markdown.contains("Main Title"));
        assert!(markdown.contains("Subsection"));

        assert_eq!(metadata.document.language, Some("en".to_string()));

        assert_eq!(metadata.headers.len(), 2);
        assert_eq!(metadata.headers[0].level, 1);
        assert_eq!(metadata.headers[0].text, "Main Title");
        assert_eq!(metadata.headers[0].id, Some("main-title".to_string()));
        assert_eq!(metadata.headers[1].level, 2);
        assert_eq!(metadata.headers[1].text, "Subsection");

        assert!(metadata.links.len() >= 2);
        let external_link = metadata.links.iter().find(|l| l.link_type == LinkType::External);
        assert!(external_link.is_some());
        let anchor_link = metadata.links.iter().find(|l| l.link_type == LinkType::Anchor);
        assert!(anchor_link.is_some());

        assert_eq!(metadata.images.len(), 1);
        assert_eq!(metadata.images[0].alt, Some("Test image".to_string()));
        assert_eq!(metadata.images[0].title, Some("Image title".to_string()));
        assert_eq!(metadata.images[0].image_type, ImageType::External);
    }

    /// Default configuration: title, description, author, language and Open Graph
    /// entries are read from the document head.
    #[test]
    fn test_convert_with_metadata_document_fields() {
        let html = "<html lang=\"en\"><head><title>Test Article</title><meta name=\"description\" content=\"Desc\"><meta name=\"author\" content=\"Author\"><meta property=\"og:title\" content=\"OG Title\"><meta property=\"og:description\" content=\"OG Desc\"></head><body><h1>Heading</h1></body></html>";

        let (_markdown, metadata) =
            convert_with_metadata(html, None, MetadataConfig::default(), None).expect("conversion should succeed");

        assert_eq!(
            metadata.document.title,
            Some("Test Article".to_string()),
            "document: {:?}",
            metadata.document
        );
        assert_eq!(metadata.document.description, Some("Desc".to_string()));
        assert_eq!(metadata.document.author, Some("Author".to_string()));
        assert_eq!(metadata.document.language, Some("en".to_string()));
        assert_eq!(metadata.document.open_graph.get("title"), Some(&"OG Title".to_string()));
        assert_eq!(
            metadata.document.open_graph.get("description"),
            Some(&"OG Desc".to_string())
        );
    }

    /// Every extraction kind disabled: the returned metadata stays empty.
    #[test]
    fn test_convert_with_metadata_empty_config() {
        let html = "<html lang=\"en\"><head><title>Test</title></head><body><h1>Title</h1><a href=\"#\">Link</a></body></html>";

        let config = MetadataConfig {
            extract_document: false,
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: 0,
        };

        let (_markdown, metadata) = convert_with_metadata(html, None, config, None).expect("conversion should succeed");

        assert!(metadata.headers.is_empty());
        assert!(metadata.links.is_empty());
        assert!(metadata.images.is_empty());
        assert_eq!(metadata.document.language, None);
    }

    /// An `<img>` whose `src` is a `data:` URI is reported with `ImageType::DataUri`.
    #[test]
    fn test_convert_with_metadata_data_uri_image() {
        // The fixture must carry an actual `data:` URI; an empty `src` gives the
        // collector nothing to classify as a data URI. The payload is a commonly used
        // 1x1 PNG; presumably only the `data:` scheme matters here.
        let html = "<html><body><img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==\" alt=\"Pixel\"></body></html>";

        let config = MetadataConfig::default();

        let (_markdown, metadata) = convert_with_metadata(html, None, config, None).expect("conversion should succeed");

        assert_eq!(metadata.images.len(), 1);
        assert_eq!(metadata.images[0].image_type, ImageType::DataUri);
        assert_eq!(metadata.images[0].alt, Some("Pixel".to_string()));
    }

    /// Root-relative and parent-relative hrefs are both classified as internal links.
    #[test]
    fn test_convert_with_metadata_relative_paths() {
        let html = r#"<html><body><a href="/page">Internal</a><a href="../other">Relative</a></body></html>"#;

        let config = MetadataConfig::default();

        let (_markdown, metadata) = convert_with_metadata(html, None, config, None).expect("conversion should succeed");

        let internal_links: Vec<_> = metadata
            .links
            .iter()
            .filter(|l| l.link_type == LinkType::Internal)
            .collect();
        assert_eq!(internal_links.len(), 2);
    }
}
882
/// Feature-independent tests for input validation and the plain-text path of `convert`.
#[cfg(test)]
mod basic_tests {
    use super::*;

    /// Any NUL byte in the input is rejected as binary data.
    #[test]
    fn test_binary_input_rejected() {
        let html = "PDF\0DATA";
        let result = convert(html, None);
        assert!(matches!(result, Err(ConversionError::InvalidInput(_))));
    }

    /// Lossily decoded gzip bytes are rejected.
    ///
    /// Lossy decoding replaces the invalid `0x8B` byte with U+FFFD, so the string no
    /// longer starts with the gzip signature; the rejection here comes from the NUL
    /// byte. The magic-prefix path is pinned by `test_ascii_magic_prefix_rejected`.
    #[test]
    fn test_binary_magic_rejected() {
        let html = String::from_utf8_lossy(b"\x1F\x8B\x08\x00gzip").to_string();
        let result = convert(&html, None);
        assert!(matches!(result, Err(ConversionError::InvalidInput(_))));
    }

    /// A signature made only of ASCII bytes survives as-is in a `&str`; with no NUL and
    /// few control bytes, only the magic-prefix check can reject this input.
    #[test]
    fn test_ascii_magic_prefix_rejected() {
        let result = convert("%PDF-1.7 not html", None);
        assert!(matches!(
            result,
            Err(ConversionError::InvalidInput(ref msg)) if msg.contains("PDF data")
        ));
    }

    /// Lossily decoded UTF-16LE text (NUL after every ASCII byte) is rejected.
    #[test]
    fn test_utf16_hint_rejected() {
        let html = String::from_utf8_lossy(b"\xFF\xFE<\0h\0t\0m\0l\0>\0").to_string();
        let result = convert(&html, None);
        assert!(matches!(result, Err(ConversionError::InvalidInput(_))));
    }

    /// Text without markup is accepted and passed through.
    #[test]
    fn test_plain_text_allowed() {
        let result = convert("Just text", None).unwrap();
        assert!(result.contains("Just text"));
    }

    /// Escaping options also apply to text without markup.
    #[test]
    fn test_plain_text_escaped_when_enabled() {
        let options = ConversionOptions {
            escape_asterisks: true,
            escape_underscores: true,
            ..ConversionOptions::default()
        };
        let result = convert("Text *asterisks* _underscores_", Some(options)).unwrap();
        assert!(result.contains(r"\*asterisks\*"));
        assert!(result.contains(r"\_underscores\_"));
    }
}