// Crate-wide lint configuration.
//
// Every Clippy lint named below is allowed for the whole crate. The list is
// kept in alphabetical order so that additions and removals are easy to review.
#![allow(
    clippy::assigning_clones,
    clippy::branches_sharing_code,
    clippy::cast_possible_truncation,
    clippy::cast_precision_loss,
    clippy::cast_sign_loss,
    clippy::collapsible_else_if,
    clippy::collapsible_if,
    clippy::collapsible_match,
    clippy::default_trait_access,
    clippy::doc_markdown,
    clippy::explicit_iter_loop,
    clippy::extra_unused_lifetimes,
    clippy::fn_params_excessive_bools,
    clippy::items_after_statements,
    clippy::manual_assert,
    clippy::manual_let_else,
    clippy::map_unwrap_or,
    clippy::match_same_arms,
    clippy::match_wildcard_for_single_variants,
    clippy::missing_const_for_fn,
    clippy::missing_errors_doc,
    clippy::must_use_candidate,
    clippy::needless_pass_by_value,
    clippy::option_if_let_else,
    clippy::return_self_not_must_use,
    clippy::struct_excessive_bools,
    clippy::too_many_arguments,
    clippy::too_many_lines,
    clippy::trivially_copy_pass_by_ref,
    clippy::uninlined_format_args,
    clippy::unnecessary_lazy_evaluations,
    clippy::unused_self,
    clippy::used_underscore_binding,
)]
// Unused items do not warn anywhere in the crate.
// NOTE(review): presumably needed because some items are only reachable when
// optional Cargo features are enabled — confirm before narrowing this.
#![allow(dead_code)]
37
38pub mod converter;
52pub mod error;
53pub mod hocr;
54#[cfg(feature = "inline-images")]
55mod inline_images;
56#[cfg(feature = "metadata")]
57pub mod metadata;
58pub mod options;
59pub mod safety;
60pub mod text;
61#[cfg(feature = "visitor")]
62pub mod visitor;
63#[cfg(feature = "visitor")]
64pub mod visitor_helpers;
65pub mod wrapper;
66
67mod convert_api;
69mod exports;
70pub mod prelude;
71mod validation;
72
73pub use exports::*;
78
79pub use convert_api::convert;
84
85#[cfg(any(feature = "serde", feature = "metadata"))]
86pub use convert_api::{conversion_options_from_json, conversion_options_update_from_json};
87
88#[cfg(feature = "metadata")]
89pub use convert_api::metadata_config_from_json;
90
91#[cfg(feature = "inline-images")]
92pub use convert_api::{convert_with_inline_images, inline_image_config_from_json};
93
94#[cfg(feature = "metadata")]
95pub use convert_api::convert_with_metadata;
96
97#[cfg(feature = "visitor")]
98pub use convert_api::convert_with_visitor;
99
100#[cfg(feature = "async-visitor")]
101pub use convert_api::convert_with_async_visitor;
102
#[cfg(all(test, feature = "metadata"))]
mod tests {
    // Tests for `convert_with_metadata`; compiled only for test builds that
    // enable the `metadata` feature.
    use super::*;

    /// Converts a document with every extractor enabled and checks the Markdown
    /// output together with the collected language, headers, links and images.
    #[test]
    fn test_convert_with_metadata_full_workflow() {
        // `r##"…"##` is required because the fixture contains the sequence `"#`.
        let html = r##"<html lang="en" dir="ltr"><head><title>Test Article</title></head><body><h1 id="main-title">Main Title</h1><p>This is a paragraph with a <a href="https://example.com">link</a>.</p><h2>Subsection</h2><p>Another paragraph with <a href="#main-title">internal link</a>.</p><img src="https://example.com/image.jpg" alt="Test image" title="Image title"></body></html>"##;

        let config = MetadataConfig {
            extract_document: true,
            extract_headers: true,
            extract_links: true,
            extract_images: true,
            extract_structured_data: true,
            max_structured_data_size: metadata::DEFAULT_MAX_STRUCTURED_DATA_SIZE,
        };

        // The binding is deliberately not called `metadata`, so it does not
        // shadow the `metadata` module used just above.
        let (markdown, extracted) =
            convert_with_metadata(html, None, config, None).expect("conversion should succeed");

        assert!(!markdown.is_empty());
        assert!(markdown.contains("Main Title"));
        assert!(markdown.contains("Subsection"));

        assert_eq!(extracted.document.language.as_deref(), Some("en"));

        assert_eq!(extracted.headers.len(), 2);
        let (first, second) = (&extracted.headers[0], &extracted.headers[1]);
        assert_eq!(first.level, 1);
        assert_eq!(first.text, "Main Title");
        assert_eq!(first.id.as_deref(), Some("main-title"));
        assert_eq!(second.level, 2);
        assert_eq!(second.text, "Subsection");

        assert!(extracted.links.len() >= 2);
        assert!(extracted.links.iter().any(|link| link.link_type == LinkType::External));
        assert!(extracted.links.iter().any(|link| link.link_type == LinkType::Anchor));

        assert_eq!(extracted.images.len(), 1);
        let image = &extracted.images[0];
        assert_eq!(image.alt.as_deref(), Some("Test image"));
        assert_eq!(image.title.as_deref(), Some("Image title"));
        assert_eq!(image.image_type, ImageType::External);
    }

    /// Checks the document-level fields (title, description, author, language
    /// and the `og:*` entries) produced with the default `MetadataConfig`.
    #[test]
    fn test_convert_with_metadata_document_fields() {
        let html = r#"<html lang="en"><head><title>Test Article</title><meta name="description" content="Desc"><meta name="author" content="Author"><meta property="og:title" content="OG Title"><meta property="og:description" content="OG Desc"></head><body><h1>Heading</h1></body></html>"#;

        let (_markdown, extracted) =
            convert_with_metadata(html, None, MetadataConfig::default(), None).expect("conversion should succeed");

        let document = &extracted.document;
        assert_eq!(
            document.title.as_deref(),
            Some("Test Article"),
            "document: {:?}",
            document
        );
        assert_eq!(document.description.as_deref(), Some("Desc"));
        assert_eq!(document.author.as_deref(), Some("Author"));
        assert_eq!(document.language.as_deref(), Some("en"));
        // The expected keys are "title" / "description", without the `og:` prefix.
        assert_eq!(document.open_graph.get("title").map(String::as_str), Some("OG Title"));
        assert_eq!(
            document.open_graph.get("description").map(String::as_str),
            Some("OG Desc")
        );
    }

    /// With every extractor disabled nothing is collected, even though the
    /// input contains a language attribute, a heading and a link.
    #[test]
    fn test_convert_with_metadata_empty_config() {
        // `r##"…"##` is required because the fixture contains `href="#"`.
        let html = r##"<html lang="en"><head><title>Test</title></head><body><h1>Title</h1><a href="#">Link</a></body></html>"##;

        let config = MetadataConfig {
            extract_document: false,
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: 0,
        };

        let (_markdown, extracted) =
            convert_with_metadata(html, None, config, None).expect("conversion should succeed");

        assert!(extracted.headers.is_empty());
        assert!(extracted.links.is_empty());
        assert!(extracted.images.is_empty());
        assert_eq!(extracted.document.language, None);
    }

    /// An `<img>` whose `src` is a `data:` URI is reported as `ImageType::DataUri`.
    #[test]
    fn test_convert_with_metadata_data_uri_image() {
        // The source must actually be a `data:` URI: an empty `src` cannot
        // exercise the classification this test asserts on.
        let html = r#"<html><body><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==" alt="Pixel"></body></html>"#;

        let (_markdown, extracted) = convert_with_metadata(html, None, MetadataConfig::default(), None)
            .expect("conversion should succeed");

        assert_eq!(extracted.images.len(), 1);
        let image = &extracted.images[0];
        assert_eq!(image.image_type, ImageType::DataUri);
        assert_eq!(image.alt.as_deref(), Some("Pixel"));
    }

    /// Root-relative (`/page`) and parent-relative (`../other`) hrefs are both
    /// expected to be classified as `LinkType::Internal`.
    #[test]
    fn test_convert_with_metadata_relative_paths() {
        let html = r#"<html><body><a href="/page">Internal</a><a href="../other">Relative</a></body></html>"#;

        let (_markdown, extracted) = convert_with_metadata(html, None, MetadataConfig::default(), None)
            .expect("conversion should succeed");

        let internal_count = extracted
            .links
            .iter()
            .filter(|link| link.link_type == LinkType::Internal)
            .count();
        assert_eq!(internal_count, 2);
    }
}
223
#[cfg(test)]
mod basic_tests {
    // Tests for the plain `convert` entry point: rejected inputs and plain text.
    use super::*;

    /// Runs `convert` with no options and asserts that it fails with
    /// `ConversionError::InvalidInput`.
    fn assert_invalid_input(input: &str) {
        let outcome = convert(input, None);
        assert!(matches!(outcome, Err(ConversionError::InvalidInput(_))));
    }

    /// Text with an embedded NUL character is rejected.
    #[test]
    fn test_binary_input_rejected() {
        assert_invalid_input("PDF\0DATA");
    }

    /// Bytes `1F 8B 08 00` followed by "gzip", lossily decoded to a string, are rejected.
    #[test]
    fn test_binary_magic_rejected() {
        let lossy = String::from_utf8_lossy(b"\x1F\x8B\x08\x00gzip").to_string();
        assert_invalid_input(&lossy);
    }

    /// Bytes `FF FE` followed by NUL-interleaved ASCII, lossily decoded to a
    /// string, are rejected.
    #[test]
    fn test_utf16_hint_rejected() {
        let lossy = String::from_utf8_lossy(b"\xFF\xFE<\0h\0t\0m\0l\0>\0").to_string();
        assert_invalid_input(&lossy);
    }

    /// Input without any markup converts successfully and keeps its text.
    #[test]
    fn test_plain_text_allowed() {
        let markdown = convert("Just text", None).unwrap();
        assert!(markdown.contains("Just text"));
    }

    /// With both escape options enabled, `*` and `_` in plain text come out
    /// backslash-escaped.
    #[test]
    fn test_plain_text_escaped_when_enabled() {
        let escaping = ConversionOptions {
            escape_asterisks: true,
            escape_underscores: true,
            ..ConversionOptions::default()
        };

        let markdown = convert("Text *asterisks* _underscores_", Some(escaping)).unwrap();

        for expected in [r"\*asterisks\*", r"\_underscores\_"] {
            assert!(markdown.contains(expected));
        }
    }
}