Skip to main content

html_to_markdown_rs/
lib.rs

// Clippy lints silenced for the whole crate, kept in alphabetical order so
// additions and removals are easy to spot in review.
#![allow(
    clippy::assigning_clones,
    clippy::branches_sharing_code,
    clippy::cast_possible_truncation,
    clippy::cast_precision_loss,
    clippy::cast_sign_loss,
    clippy::collapsible_else_if,
    clippy::collapsible_if,
    clippy::collapsible_match,
    clippy::default_trait_access,
    clippy::doc_markdown,
    clippy::explicit_iter_loop,
    clippy::extra_unused_lifetimes,
    clippy::fn_params_excessive_bools,
    clippy::items_after_statements,
    clippy::manual_assert,
    clippy::manual_let_else,
    clippy::map_unwrap_or,
    clippy::match_same_arms,
    clippy::match_wildcard_for_single_variants,
    clippy::missing_const_for_fn,
    clippy::missing_errors_doc,
    clippy::must_use_candidate,
    clippy::needless_pass_by_value,
    clippy::option_if_let_else,
    clippy::return_self_not_must_use,
    clippy::struct_excessive_bools,
    clippy::too_many_arguments,
    clippy::too_many_lines,
    clippy::trivially_copy_pass_by_ref,
    clippy::uninlined_format_args,
    clippy::unnecessary_lazy_evaluations,
    clippy::unused_self,
    clippy::used_underscore_binding
)]
// NOTE(review): a crate-wide `dead_code` allow also hides genuinely unused
// items; presumably it is here because feature-gated builds leave some helpers
// unreferenced — confirm, and narrow the scope if possible.
#![allow(dead_code)]
37
38//! High-performance HTML to Markdown converter.
39//!
40//! Built with html5ever for fast, memory-efficient HTML parsing.
41//!
42//! ## Optional inline image extraction
43//!
44//! Enable the `inline-images` Cargo feature to collect embedded data URI images and inline SVG
45//! assets alongside the produced Markdown.
46
47// ============================================================================
48// Module Declarations
49// ============================================================================
50
51pub mod converter;
52pub mod error;
53pub mod hocr;
54#[cfg(feature = "inline-images")]
55mod inline_images;
56#[cfg(feature = "metadata")]
57pub mod metadata;
58pub mod options;
59pub mod safety;
60pub mod text;
61#[cfg(feature = "visitor")]
62pub mod visitor;
63#[cfg(feature = "visitor")]
64pub mod visitor_helpers;
65pub mod wrapper;
66
67// Internal modules (not part of public API)
68mod convert_api;
69mod exports;
70pub mod prelude;
71mod rcdom;
72mod validation;
73
74// ============================================================================
75// Public Re-exports (from exports module)
76// ============================================================================
77
78pub use exports::*;
79
80// ============================================================================
81// Main Public API Functions
82// ============================================================================
83
84pub use convert_api::convert;
85
86#[cfg(any(feature = "serde", feature = "metadata"))]
87pub use convert_api::{conversion_options_from_json, conversion_options_update_from_json};
88
89#[cfg(feature = "metadata")]
90pub use convert_api::metadata_config_from_json;
91
92#[cfg(feature = "inline-images")]
93pub use convert_api::{convert_with_inline_images, inline_image_config_from_json};
94
95#[cfg(feature = "metadata")]
96pub use convert_api::convert_with_metadata;
97
98#[cfg(feature = "visitor")]
99pub use convert_api::convert_with_visitor;
100
101#[cfg(feature = "visitor")]
102pub use convert_api::{ConversionWithTables, TableData, convert_with_tables};
103
104#[cfg(feature = "async-visitor")]
105pub use convert_api::convert_with_async_visitor;
106
// ============================================================================
// Tests
// ============================================================================
109
/// Tests for `convert_with_metadata`; compiled only for test builds that have
/// the `metadata` feature enabled.
#[cfg(all(test, feature = "metadata"))]
mod tests {
    use super::*;

    /// Runs a conversion with every extractor switched on and checks the
    /// Markdown text plus the document, header, link and image metadata.
    #[test]
    fn test_convert_with_metadata_full_workflow() {
        let html = "<html lang=\"en\" dir=\"ltr\"><head><title>Test Article</title></head><body><h1 id=\"main-title\">Main Title</h1><p>This is a paragraph with a <a href=\"https://example.com\">link</a>.</p><h2>Subsection</h2><p>Another paragraph with <a href=\"#main-title\">internal link</a>.</p><img src=\"https://example.com/image.jpg\" alt=\"Test image\" title=\"Image title\"></body></html>";

        let everything_on = MetadataConfig {
            extract_document: true,
            extract_headers: true,
            extract_links: true,
            extract_images: true,
            extract_structured_data: true,
            max_structured_data_size: metadata::DEFAULT_MAX_STRUCTURED_DATA_SIZE,
        };

        let (markdown, extracted) =
            convert_with_metadata(html, None, everything_on, None).expect("conversion should succeed");

        // Markdown body carries both heading texts.
        assert!(!markdown.is_empty());
        assert!(markdown.contains("Main Title"));
        assert!(markdown.contains("Subsection"));

        // Document-level metadata.
        assert_eq!(extracted.document.language.as_deref(), Some("en"));

        // Headers: the <h1> (with its id) followed by the <h2>.
        assert_eq!(extracted.headers.len(), 2);
        let (first, second) = (&extracted.headers[0], &extracted.headers[1]);
        assert_eq!(first.level, 1);
        assert_eq!(first.text, "Main Title");
        assert_eq!(first.id.as_deref(), Some("main-title"));
        assert_eq!(second.level, 2);
        assert_eq!(second.text, "Subsection");

        // Links: at least one external link and one anchor link are reported.
        assert!(extracted.links.len() >= 2);
        assert!(extracted.links.iter().any(|link| link.link_type == LinkType::External));
        assert!(extracted.links.iter().any(|link| link.link_type == LinkType::Anchor));

        // Images: the single <img> is reported with its alt/title attributes.
        assert_eq!(extracted.images.len(), 1);
        let image = &extracted.images[0];
        assert_eq!(image.alt.as_deref(), Some("Test image"));
        assert_eq!(image.title.as_deref(), Some("Image title"));
        assert_eq!(image.image_type, ImageType::External);
    }

    /// With the default config, `<title>`, `<meta>` and Open Graph tags are
    /// surfaced on the document metadata.
    #[test]
    fn test_convert_with_metadata_document_fields() {
        let html = "<html lang=\"en\"><head><title>Test Article</title><meta name=\"description\" content=\"Desc\"><meta name=\"author\" content=\"Author\"><meta property=\"og:title\" content=\"OG Title\"><meta property=\"og:description\" content=\"OG Desc\"></head><body><h1>Heading</h1></body></html>";

        let (_, extracted) =
            convert_with_metadata(html, None, MetadataConfig::default(), None).expect("conversion should succeed");
        let document = &extracted.document;

        assert_eq!(
            document.title.as_deref(),
            Some("Test Article"),
            "document: {:?}",
            document
        );
        assert_eq!(document.description.as_deref(), Some("Desc"));
        assert_eq!(document.author.as_deref(), Some("Author"));
        assert_eq!(document.language.as_deref(), Some("en"));

        // Open Graph entries are looked up without the `og:` prefix.
        let open_graph = &document.open_graph;
        assert_eq!(open_graph.get("title").map(String::as_str), Some("OG Title"));
        assert_eq!(open_graph.get("description").map(String::as_str), Some("OG Desc"));
    }

    /// With every extractor disabled, no headers, links, images or document
    /// language are reported.
    #[test]
    fn test_convert_with_metadata_empty_config() {
        let html = "<html lang=\"en\"><head><title>Test</title></head><body><h1>Title</h1><a href=\"#\">Link</a></body></html>";

        let everything_off = MetadataConfig {
            extract_document: false,
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: 0,
        };

        let (_, extracted) =
            convert_with_metadata(html, None, everything_off, None).expect("conversion should succeed");

        assert!(extracted.headers.is_empty());
        assert!(extracted.links.is_empty());
        assert!(extracted.images.is_empty());
        assert!(extracted.document.language.is_none());
    }

    /// An `<img>` whose `src` is a `data:` URI is reported as `ImageType::DataUri`.
    #[test]
    fn test_convert_with_metadata_data_uri_image() {
        let html = "<html><body><img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==\" alt=\"Pixel\"></body></html>";

        let (_, extracted) =
            convert_with_metadata(html, None, MetadataConfig::default(), None).expect("conversion should succeed");

        assert_eq!(extracted.images.len(), 1);
        let pixel = &extracted.images[0];
        assert_eq!(pixel.image_type, ImageType::DataUri);
        assert_eq!(pixel.alt.as_deref(), Some("Pixel"));
    }

    /// Root-relative (`/page`) and parent-relative (`../other`) hrefs are both
    /// classified as `LinkType::Internal`.
    #[test]
    fn test_convert_with_metadata_relative_paths() {
        let html = r#"<html><body><a href="/page">Internal</a><a href="../other">Relative</a></body></html>"#;

        let (_, extracted) =
            convert_with_metadata(html, None, MetadataConfig::default(), None).expect("conversion should succeed");

        let internal_count = extracted
            .links
            .iter()
            .filter(|link| link.link_type == LinkType::Internal)
            .count();
        assert_eq!(internal_count, 2);
    }
}
227
/// Feature-independent tests for `convert`: input validation and plain-text
/// handling.
#[cfg(test)]
mod basic_tests {
    use super::*;

    /// Text containing a run of twenty NUL characters is refused as invalid input.
    #[test]
    fn test_binary_input_rejected() {
        let nul_run = "\0".repeat(20);
        let input = format!("abc{}def", nul_run);

        let outcome = convert(&input, None);

        assert!(matches!(outcome, Err(ConversionError::InvalidInput(_))));
    }

    /// Input that begins with a PDF header is refused as invalid input.
    #[test]
    fn test_binary_magic_rejected() {
        let outcome = convert("%PDF-1.7", None);

        assert!(matches!(outcome, Err(ConversionError::InvalidInput(_))));
    }

    /// UTF-16LE bytes (BOM included) that were lossily decoded as UTF-8 must
    /// still convert successfully.
    #[test]
    fn test_utf16_hint_recovered() {
        let utf16_bytes: &[u8] = b"\xFF\xFE<\0h\0t\0m\0l\0>\0";
        let input = String::from_utf8_lossy(utf16_bytes).to_string();

        let outcome = convert(&input, None);

        assert!(outcome.is_ok(), "UTF-16 input should be recovered instead of rejected");
    }

    /// Input without any markup passes through with its text intact.
    #[test]
    fn test_plain_text_allowed() {
        let markdown = convert("Just text", None).unwrap();

        assert!(markdown.contains("Just text"));
    }

    /// With asterisk and underscore escaping enabled, those characters are
    /// backslash-escaped even in markup-free input.
    #[test]
    fn test_plain_text_escaped_when_enabled() {
        let escaping = ConversionOptions {
            escape_asterisks: true,
            escape_underscores: true,
            ..ConversionOptions::default()
        };

        let markdown = convert("Text *asterisks* _underscores_", Some(escaping)).unwrap();

        assert!(markdown.contains(r"\*asterisks\*"));
        assert!(markdown.contains(r"\_underscores\_"));
    }
}
270}