1#![allow(
2 clippy::too_many_lines,
3 clippy::option_if_let_else,
4 clippy::match_wildcard_for_single_variants,
5 clippy::needless_pass_by_value,
6 clippy::struct_excessive_bools,
7 clippy::fn_params_excessive_bools,
8 clippy::branches_sharing_code,
9 clippy::match_same_arms,
10 clippy::missing_errors_doc,
11 clippy::items_after_statements,
12 clippy::doc_markdown,
13 clippy::cast_sign_loss,
14 clippy::default_trait_access,
15 clippy::unused_self,
16 clippy::cast_precision_loss,
17 clippy::collapsible_if,
18 clippy::too_many_arguments,
19 clippy::collapsible_else_if,
20 clippy::extra_unused_lifetimes,
21 clippy::unnecessary_lazy_evaluations,
22 clippy::must_use_candidate,
23 clippy::trivially_copy_pass_by_ref,
24 clippy::explicit_iter_loop,
25 clippy::missing_const_for_fn,
26 clippy::manual_assert,
27 clippy::return_self_not_must_use,
28 clippy::collapsible_match,
29 clippy::cast_possible_truncation,
30 clippy::map_unwrap_or,
31 clippy::manual_let_else,
32 clippy::used_underscore_binding,
33 clippy::assigning_clones,
34 clippy::uninlined_format_args
35)]
36#![allow(dead_code)]
37
38pub mod converter;
52pub mod error;
53pub mod hocr;
54#[cfg(feature = "inline-images")]
55mod inline_images;
56#[cfg(feature = "metadata")]
57pub mod metadata;
58pub mod options;
59pub mod safety;
60pub mod text;
61#[cfg(feature = "visitor")]
62pub mod visitor;
63#[cfg(feature = "visitor")]
64pub mod visitor_helpers;
65pub mod wrapper;
66
67mod convert_api;
69mod exports;
70pub mod prelude;
71mod rcdom;
72mod validation;
73
74pub use exports::*;
79
80pub use convert_api::convert;
85
86#[cfg(any(feature = "serde", feature = "metadata"))]
87pub use convert_api::{conversion_options_from_json, conversion_options_update_from_json};
88
89#[cfg(feature = "metadata")]
90pub use convert_api::metadata_config_from_json;
91
92#[cfg(feature = "inline-images")]
93pub use convert_api::{convert_with_inline_images, inline_image_config_from_json};
94
95#[cfg(feature = "metadata")]
96pub use convert_api::convert_with_metadata;
97
98#[cfg(feature = "visitor")]
99pub use convert_api::convert_with_visitor;
100
101#[cfg(feature = "visitor")]
102pub use convert_api::{ConversionWithTables, TableData, convert_with_tables};
103
104#[cfg(feature = "async-visitor")]
105pub use convert_api::convert_with_async_visitor;
106
/// Tests for `convert_with_metadata`; compiled only for test builds with the
/// `metadata` feature enabled.
#[cfg(all(test, feature = "metadata"))]
mod tests {
    use super::*;

    /// Runs a conversion with every extraction flag switched on and checks the
    /// Markdown output plus the collected document, header, link and image metadata.
    #[test]
    fn test_convert_with_metadata_full_workflow() {
        let html = "<html lang=\"en\" dir=\"ltr\"><head><title>Test Article</title></head><body><h1 id=\"main-title\">Main Title</h1><p>This is a paragraph with a <a href=\"https://example.com\">link</a>.</p><h2>Subsection</h2><p>Another paragraph with <a href=\"#main-title\">internal link</a>.</p><img src=\"https://example.com/image.jpg\" alt=\"Test image\" title=\"Image title\"></body></html>";

        let everything_enabled = MetadataConfig {
            extract_document: true,
            extract_headers: true,
            extract_links: true,
            extract_images: true,
            extract_structured_data: true,
            max_structured_data_size: metadata::DEFAULT_MAX_STRUCTURED_DATA_SIZE,
        };

        let (md, meta) =
            convert_with_metadata(html, None, everything_enabled, None).expect("conversion should succeed");

        // Markdown output carries both headings.
        assert!(!md.is_empty());
        assert!(md.contains("Main Title"));
        assert!(md.contains("Subsection"));

        // Document-level metadata.
        assert_eq!(meta.document.language.as_deref(), Some("en"));

        // Headers are reported in document order.
        assert_eq!(meta.headers.len(), 2);
        let top_heading = &meta.headers[0];
        assert_eq!(top_heading.level, 1);
        assert_eq!(top_heading.text, "Main Title");
        assert_eq!(top_heading.id.as_deref(), Some("main-title"));
        let sub_heading = &meta.headers[1];
        assert_eq!(sub_heading.level, 2);
        assert_eq!(sub_heading.text, "Subsection");

        // One external link and one in-page anchor are expected among the links.
        assert!(meta.links.len() >= 2);
        assert!(meta.links.iter().any(|link| link.link_type == LinkType::External));
        assert!(meta.links.iter().any(|link| link.link_type == LinkType::Anchor));

        // The single <img> element.
        assert_eq!(meta.images.len(), 1);
        let image = &meta.images[0];
        assert_eq!(image.alt.as_deref(), Some("Test image"));
        assert_eq!(image.title.as_deref(), Some("Image title"));
        assert_eq!(image.image_type, ImageType::External);
    }

    /// Checks that title, description, author, language and Open Graph entries are
    /// picked up from `<head>` when using the default configuration.
    #[test]
    fn test_convert_with_metadata_document_fields() {
        let html = "<html lang=\"en\"><head><title>Test Article</title><meta name=\"description\" content=\"Desc\"><meta name=\"author\" content=\"Author\"><meta property=\"og:title\" content=\"OG Title\"><meta property=\"og:description\" content=\"OG Desc\"></head><body><h1>Heading</h1></body></html>";

        let (_md, meta) =
            convert_with_metadata(html, None, MetadataConfig::default(), None).expect("conversion should succeed");
        let document = &meta.document;

        assert_eq!(
            document.title.as_deref(),
            Some("Test Article"),
            "document: {:?}",
            meta.document
        );
        assert_eq!(document.description.as_deref(), Some("Desc"));
        assert_eq!(document.author.as_deref(), Some("Author"));
        assert_eq!(document.language.as_deref(), Some("en"));
        assert_eq!(document.open_graph.get("title").map(String::as_str), Some("OG Title"));
        assert_eq!(
            document.open_graph.get("description").map(String::as_str),
            Some("OG Desc")
        );
    }

    /// With every extraction flag disabled, no headers, links, images or document
    /// language should be reported.
    #[test]
    fn test_convert_with_metadata_empty_config() {
        let html = "<html lang=\"en\"><head><title>Test</title></head><body><h1>Title</h1><a href=\"#\">Link</a></body></html>";

        let nothing_enabled = MetadataConfig {
            extract_document: false,
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: 0,
        };

        let (_md, meta) =
            convert_with_metadata(html, None, nothing_enabled, None).expect("conversion should succeed");

        assert!(meta.headers.is_empty());
        assert!(meta.links.is_empty());
        assert!(meta.images.is_empty());
        assert!(meta.document.language.is_none());
    }

    /// An `<img>` whose `src` is a `data:` URI is classified as `ImageType::DataUri`.
    #[test]
    fn test_convert_with_metadata_data_uri_image() {
        let html = "<html><body><img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==\" alt=\"Pixel\"></body></html>";

        let (_md, meta) =
            convert_with_metadata(html, None, MetadataConfig::default(), None).expect("conversion should succeed");

        assert_eq!(meta.images.len(), 1);
        let image = &meta.images[0];
        assert_eq!(image.image_type, ImageType::DataUri);
        assert_eq!(image.alt.as_deref(), Some("Pixel"));
    }

    /// Root-relative (`/page`) and parent-relative (`../other`) hrefs both count as
    /// internal links.
    #[test]
    fn test_convert_with_metadata_relative_paths() {
        let html = r#"<html><body><a href="/page">Internal</a><a href="../other">Relative</a></body></html>"#;

        let (_md, meta) =
            convert_with_metadata(html, None, MetadataConfig::default(), None).expect("conversion should succeed");

        let internal_count = meta
            .links
            .iter()
            .filter(|link| link.link_type == LinkType::Internal)
            .count();
        assert_eq!(internal_count, 2);
    }
}
227
/// Feature-independent smoke tests for the top-level `convert` entry point.
#[cfg(test)]
mod basic_tests {
    use super::*;

    /// Text with an embedded run of twenty NUL bytes is rejected as invalid input.
    #[test]
    fn test_binary_input_rejected() {
        let mut html = String::from("abc");
        html.push_str(&"\0".repeat(20));
        html.push_str("def");

        assert!(matches!(
            convert(&html, None),
            Err(ConversionError::InvalidInput(_))
        ));
    }

    /// Input starting with a PDF magic header is rejected as invalid input.
    #[test]
    fn test_binary_magic_rejected() {
        assert!(matches!(
            convert("%PDF-1.7", None),
            Err(ConversionError::InvalidInput(_))
        ));
    }

    /// Bytes shaped like BOM-prefixed UTF-16LE `<html>`, lossily decoded to a
    /// `String`, must still convert successfully.
    #[test]
    fn test_utf16_hint_recovered() {
        let raw: &[u8] = b"\xFF\xFE<\0h\0t\0m\0l\0>\0";
        let html = String::from_utf8_lossy(raw).into_owned();

        assert!(
            convert(&html, None).is_ok(),
            "UTF-16 input should be recovered instead of rejected"
        );
    }

    /// Plain text without any markup passes through the converter.
    #[test]
    fn test_plain_text_allowed() {
        let output = convert("Just text", None).unwrap();
        assert!(output.contains("Just text"));
    }

    /// With the escape options enabled, asterisks and underscores in plain text
    /// come out backslash-escaped.
    #[test]
    fn test_plain_text_escaped_when_enabled() {
        let escaping = ConversionOptions {
            escape_asterisks: true,
            escape_underscores: true,
            ..ConversionOptions::default()
        };

        let output = convert("Text *asterisks* _underscores_", Some(escaping)).unwrap();

        assert!(output.contains(r"\*asterisks\*"));
        assert!(output.contains(r"\_underscores\_"));
    }
}