html_to_markdown_rs/
lib.rs

1#![allow(
2    clippy::too_many_lines,
3    clippy::option_if_let_else,
4    clippy::match_wildcard_for_single_variants,
5    clippy::needless_pass_by_value,
6    clippy::struct_excessive_bools,
7    clippy::fn_params_excessive_bools,
8    clippy::branches_sharing_code,
9    clippy::match_same_arms,
10    clippy::missing_errors_doc,
11    clippy::items_after_statements,
12    clippy::doc_markdown,
13    clippy::cast_sign_loss,
14    clippy::default_trait_access,
15    clippy::unused_self,
16    clippy::cast_precision_loss,
17    clippy::collapsible_if,
18    clippy::too_many_arguments,
19    clippy::collapsible_else_if,
20    clippy::extra_unused_lifetimes,
21    clippy::unnecessary_lazy_evaluations,
22    clippy::must_use_candidate,
23    clippy::trivially_copy_pass_by_ref,
24    clippy::explicit_iter_loop,
25    clippy::missing_const_for_fn,
26    clippy::manual_assert,
27    clippy::return_self_not_must_use,
28    clippy::collapsible_match,
29    clippy::cast_possible_truncation,
30    clippy::map_unwrap_or,
31    clippy::manual_let_else,
32    clippy::used_underscore_binding,
33    clippy::assigning_clones,
34    clippy::uninlined_format_args
35)]
36#![allow(dead_code)]
37
38//! High-performance HTML to Markdown converter.
39//!
40//! Built with html5ever for fast, memory-efficient HTML parsing.
41//!
42//! ## Optional inline image extraction
43//!
44//! Enable the `inline-images` Cargo feature to collect embedded data URI images and inline SVG
45//! assets alongside the produced Markdown.
46
47// ============================================================================
48// Module Declarations
49// ============================================================================
50
51pub mod converter;
52pub mod error;
53pub mod hocr;
54#[cfg(feature = "inline-images")]
55mod inline_images;
56#[cfg(feature = "metadata")]
57pub mod metadata;
58pub mod options;
59pub mod safety;
60pub mod text;
61#[cfg(feature = "visitor")]
62pub mod visitor;
63#[cfg(feature = "visitor")]
64pub mod visitor_helpers;
65pub mod wrapper;
66
// Internal modules (not part of public API), plus the public `prelude` module
68mod convert_api;
69mod exports;
70pub mod prelude;
71mod validation;
72
73// ============================================================================
74// Public Re-exports (from exports module)
75// ============================================================================
76
77pub use exports::*;
78
79// ============================================================================
80// Main Public API Functions
81// ============================================================================
82
83pub use convert_api::convert;
84
85#[cfg(any(feature = "serde", feature = "metadata"))]
86pub use convert_api::{conversion_options_from_json, conversion_options_update_from_json};
87
88#[cfg(feature = "metadata")]
89pub use convert_api::metadata_config_from_json;
90
91#[cfg(feature = "inline-images")]
92pub use convert_api::{convert_with_inline_images, inline_image_config_from_json};
93
94#[cfg(feature = "metadata")]
95pub use convert_api::convert_with_metadata;
96
97#[cfg(feature = "visitor")]
98pub use convert_api::convert_with_visitor;
99
100#[cfg(feature = "async-visitor")]
101pub use convert_api::convert_with_async_visitor;
102
// ============================================================================
// Tests
// ============================================================================
105
/// Tests for `convert_with_metadata`; only compiled for test builds with the
/// `metadata` feature enabled.
#[cfg(all(test, feature = "metadata"))]
mod tests {
    use super::*;

    /// Runs a conversion with every extractor switched on and checks the Markdown
    /// output together with the document language, headers, links and images.
    #[test]
    fn test_convert_with_metadata_full_workflow() {
        let html = "<html lang=\"en\" dir=\"ltr\"><head><title>Test Article</title></head><body><h1 id=\"main-title\">Main Title</h1><p>This is a paragraph with a <a href=\"https://example.com\">link</a>.</p><h2>Subsection</h2><p>Another paragraph with <a href=\"#main-title\">internal link</a>.</p><img src=\"https://example.com/image.jpg\" alt=\"Test image\" title=\"Image title\"></body></html>";

        let config = MetadataConfig {
            extract_document: true,
            extract_headers: true,
            extract_links: true,
            extract_images: true,
            extract_structured_data: true,
            max_structured_data_size: metadata::DEFAULT_MAX_STRUCTURED_DATA_SIZE,
        };

        // The result is bound as `extracted` (not `metadata`) so it cannot be
        // confused with the `metadata` module referenced just above.
        let (markdown, extracted) =
            convert_with_metadata(html, None, config, None).expect("conversion should succeed");

        // Markdown output.
        assert!(!markdown.is_empty());
        assert!(markdown.contains("Main Title"));
        assert!(markdown.contains("Subsection"));

        // Document-level metadata.
        assert_eq!(extracted.document.language.as_deref(), Some("en"));

        // Headers, in document order.
        assert_eq!(extracted.headers.len(), 2);
        assert_eq!(extracted.headers[0].level, 1);
        assert_eq!(extracted.headers[0].text, "Main Title");
        assert_eq!(extracted.headers[0].id.as_deref(), Some("main-title"));
        assert_eq!(extracted.headers[1].level, 2);
        assert_eq!(extracted.headers[1].text, "Subsection");

        // Links: one absolute URL and one `#fragment` reference are present in the input.
        assert!(extracted.links.len() >= 2);
        assert!(extracted.links.iter().any(|l| l.link_type == LinkType::External));
        assert!(extracted.links.iter().any(|l| l.link_type == LinkType::Anchor));

        // Images.
        assert_eq!(extracted.images.len(), 1);
        assert_eq!(extracted.images[0].alt.as_deref(), Some("Test image"));
        assert_eq!(extracted.images[0].title.as_deref(), Some("Image title"));
        assert_eq!(extracted.images[0].image_type, ImageType::External);
    }

    /// Checks that `<title>`, `<meta name=…>`, `lang` and `og:*` properties end up in
    /// the document metadata when the default configuration is used.
    #[test]
    fn test_convert_with_metadata_document_fields() {
        let html = "<html lang=\"en\"><head><title>Test Article</title><meta name=\"description\" content=\"Desc\"><meta name=\"author\" content=\"Author\"><meta property=\"og:title\" content=\"OG Title\"><meta property=\"og:description\" content=\"OG Desc\"></head><body><h1>Heading</h1></body></html>";

        let (_markdown, extracted) =
            convert_with_metadata(html, None, MetadataConfig::default(), None).expect("conversion should succeed");

        let document = &extracted.document;
        assert_eq!(
            document.title.as_deref(),
            Some("Test Article"),
            "document: {:?}",
            document
        );
        assert_eq!(document.description.as_deref(), Some("Desc"));
        assert_eq!(document.author.as_deref(), Some("Author"));
        assert_eq!(document.language.as_deref(), Some("en"));
        // Open Graph entries are looked up without the `og:` prefix.
        assert_eq!(document.open_graph.get("title").map(String::as_str), Some("OG Title"));
        assert_eq!(
            document.open_graph.get("description").map(String::as_str),
            Some("OG Desc")
        );
    }

    /// With every extractor disabled, no headers, links, images or document
    /// language may be reported.
    #[test]
    fn test_convert_with_metadata_empty_config() {
        let html = "<html lang=\"en\"><head><title>Test</title></head><body><h1>Title</h1><a href=\"#\">Link</a></body></html>";

        let config = MetadataConfig {
            extract_document: false,
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: 0,
        };

        let (_markdown, extracted) =
            convert_with_metadata(html, None, config, None).expect("conversion should succeed");

        assert!(extracted.headers.is_empty());
        assert!(extracted.links.is_empty());
        assert!(extracted.images.is_empty());
        assert_eq!(extracted.document.language, None);
    }

    /// An `<img>` whose `src` is a `data:` URI must be reported as `ImageType::DataUri`.
    #[test]
    fn test_convert_with_metadata_data_uri_image() {
        // The `src` must actually carry a data URI for this test to exercise what its
        // name says; the payload is a 1x1 PNG, base64-encoded.
        let html = "<html><body><img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==\" alt=\"Pixel\"></body></html>";

        let (_markdown, extracted) =
            convert_with_metadata(html, None, MetadataConfig::default(), None).expect("conversion should succeed");

        assert_eq!(extracted.images.len(), 1);
        assert_eq!(extracted.images[0].image_type, ImageType::DataUri);
        assert_eq!(extracted.images[0].alt.as_deref(), Some("Pixel"));
    }

    /// Root-relative (`/page`) and parent-relative (`../other`) hrefs are both
    /// expected to be classified as `LinkType::Internal`.
    #[test]
    fn test_convert_with_metadata_relative_paths() {
        let html = r#"<html><body><a href="/page">Internal</a><a href="../other">Relative</a></body></html>"#;

        let (_markdown, extracted) =
            convert_with_metadata(html, None, MetadataConfig::default(), None).expect("conversion should succeed");

        let internal_count = extracted
            .links
            .iter()
            .filter(|l| l.link_type == LinkType::Internal)
            .count();
        assert_eq!(internal_count, 2);
    }
}
223
/// Input-validation and plain-text smoke tests for `convert`; these need no
/// optional Cargo features.
#[cfg(test)]
mod basic_tests {
    use super::*;

    /// Returns `true` when `convert`, called without options, fails with
    /// `ConversionError::InvalidInput` for `input`.
    fn is_invalid_input(input: &str) -> bool {
        matches!(convert(input, None), Err(ConversionError::InvalidInput(_)))
    }

    /// Text containing an embedded NUL byte is refused.
    #[test]
    fn test_binary_input_rejected() {
        assert!(is_invalid_input("PDF\0DATA"));
    }

    /// A gzip-style header is refused.
    #[test]
    fn test_binary_magic_rejected() {
        // `from_utf8_lossy` substitutes U+FFFD for the invalid `\x8B` byte, so the
        // converter receives well-formed UTF-8 that still contains the NUL byte.
        let lossy = String::from_utf8_lossy(b"\x1F\x8B\x08\x00gzip").to_string();
        assert!(is_invalid_input(&lossy));
    }

    /// Bytes shaped like BOM-prefixed UTF-16LE (NUL-interleaved ASCII) are refused.
    #[test]
    fn test_utf16_hint_rejected() {
        // The `\xFF\xFE` prefix is not valid UTF-8 and is replaced with U+FFFD by the
        // lossy decode; the interleaved NUL bytes are kept.
        let lossy = String::from_utf8_lossy(b"\xFF\xFE<\0h\0t\0m\0l\0>\0").to_string();
        assert!(is_invalid_input(&lossy));
    }

    /// Input without any markup converts successfully and keeps its text.
    #[test]
    fn test_plain_text_allowed() {
        let markdown = convert("Just text", None).unwrap();
        assert!(markdown.contains("Just text"));
    }

    /// With both escape options on, `*` and `_` in plain text come back backslash-escaped.
    #[test]
    fn test_plain_text_escaped_when_enabled() {
        let escaping = ConversionOptions {
            escape_asterisks: true,
            escape_underscores: true,
            ..ConversionOptions::default()
        };

        let markdown = convert("Text *asterisks* _underscores_", Some(escaping)).unwrap();

        assert!(markdown.contains(r"\*asterisks\*"));
        assert!(markdown.contains(r"\_underscores\_"));
    }
}
263        assert!(result.contains(r"\*asterisks\*"));
264        assert!(result.contains(r"\_underscores\_"));
265    }
266}