1mod converter;
40mod elements;
41pub mod options;
42mod utils;
43
44pub use options::MarkdownOptions;
45
46use dom_query::{Document, Selection};
47
48use converter::convert_node;
49use utils::normalize_output;
50
51#[must_use]
66pub fn to_markdown(doc: &Document) -> String {
67 to_markdown_with_options(doc, &MarkdownOptions::new())
68}
69
70#[must_use]
72pub fn to_markdown_with_options(doc: &Document, options: &MarkdownOptions) -> String {
73 let mut output = String::new();
74 let body = doc.select("body");
75
76 if body.exists() {
77 for node in body.children().nodes() {
78 let sel = Selection::from(*node);
79 convert_node(&sel, &mut output, options, 0);
80 }
81 } else {
82 for node in doc.select("*").first().children().nodes() {
84 let sel = Selection::from(*node);
85 convert_node(&sel, &mut output, options, 0);
86 }
87 }
88
89 normalize_output(&output)
90}
91
92#[must_use]
94pub fn element_to_markdown(element: &Selection) -> String {
95 element_to_markdown_with_options(element, &MarkdownOptions::new())
96}
97
98#[must_use]
100pub fn element_to_markdown_with_options(element: &Selection, options: &MarkdownOptions) -> String {
101 let mut output = String::new();
102 convert_node(element, &mut output, options, 0);
103 normalize_output(&output)
104}
105
106#[must_use]
118pub fn html_to_markdown(html: &str) -> String {
119 let doc = Document::from(html);
120 to_markdown(&doc)
121}
122
123#[must_use]
125pub fn html_to_markdown_with_options(html: &str, options: &MarkdownOptions) -> String {
126 let doc = Document::from(html);
127 to_markdown_with_options(&doc, options)
128}
129
/// End-to-end tests that drive the public conversion entry points with small
/// HTML snippets and check the produced Markdown.
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    // ----- Basic conversion -------------------------------------------------

    #[test]
    fn test_heading_conversion() {
        let md = html_to_markdown("<h1>Title</h1><h2>Subtitle</h2>");
        assert_eq!(md, "# Title\n\n## Subtitle\n");
    }

    #[test]
    fn test_all_heading_levels() {
        let html = "<h1>H1</h1><h2>H2</h2><h3>H3</h3><h4>H4</h4><h5>H5</h5><h6>H6</h6>";
        let md = html_to_markdown(html);
        assert!(md.contains("# H1"));
        assert!(md.contains("## H2"));
        assert!(md.contains("### H3"));
        assert!(md.contains("#### H4"));
        assert!(md.contains("##### H5"));
        assert!(md.contains("###### H6"));
    }

    #[test]
    fn test_paragraph() {
        let md = html_to_markdown("<p>Hello world</p>");
        assert_eq!(md, "Hello world\n");
    }

    #[test]
    fn test_bold_and_italic() {
        let md = html_to_markdown("<p><strong>bold</strong> and <em>italic</em></p>");
        assert_eq!(md, "**bold** and *italic*\n");
    }

    #[test]
    fn test_strikethrough() {
        let md = html_to_markdown("<p><del>deleted</del> and <s>struck</s></p>");
        assert_eq!(md, "~~deleted~~ and ~~struck~~\n");
    }

    #[test]
    fn test_links() {
        let md = html_to_markdown(r#"<a href="https://example.com">Link</a>"#);
        assert_eq!(md, "[Link](https://example.com)\n");
    }

    #[test]
    fn test_unordered_list() {
        let md = html_to_markdown("<ul><li>One</li><li>Two</li></ul>");
        assert_eq!(md, "- One\n- Two\n");
    }

    #[test]
    fn test_ordered_list() {
        let md = html_to_markdown("<ol><li>First</li><li>Second</li></ol>");
        assert_eq!(md, "1. First\n2. Second\n");
    }

    #[test]
    fn test_code_block() {
        let md = html_to_markdown("<pre><code>fn main() {}</code></pre>");
        assert!(md.contains("```"));
        assert!(md.contains("fn main()"));
    }

    #[test]
    fn test_blockquote() {
        let md = html_to_markdown("<blockquote>Quote text</blockquote>");
        assert_eq!(md, "> Quote text\n");
    }

    // ----- Nested lists -----------------------------------------------------
    //
    // NOTE(review): the expected strings in this section show a single space
    // of indentation at every nesting depth (in `test_deeply_nested_list` the
    // second and third levels are indented identically). That looks like
    // collapsed whitespace rather than intended output — confirm the literals
    // against the converter's actual indent width.

    #[test]
    fn test_nested_unordered_list() {
        let html = "<ul><li>parent<ul><li>child</li></ul></li></ul>";
        let md = html_to_markdown(html);
        assert_eq!(md, "- parent\n - child\n");
    }

    #[test]
    fn test_deeply_nested_list() {
        let html = "<ul><li>L1<ul><li>L2<ul><li>L3</li></ul></li></ul></li></ul>";
        let md = html_to_markdown(html);
        assert_eq!(md, "- L1\n - L2\n - L3\n");
    }

    #[test]
    fn test_siblings_with_nested_children() {
        let html = "<ul><li>A<ul><li>A1</li></ul></li><li>B<ul><li>B1</li></ul></li></ul>";
        let md = html_to_markdown(html);
        // Each nested item must be emitted exactly once (no duplication from
        // the parent item's text extraction).
        assert_eq!(md.matches("A1").count(), 1);
        assert_eq!(md.matches("B1").count(), 1);
        assert_eq!(md, "- A\n - A1\n- B\n - B1\n");
    }

    #[test]
    fn test_nested_ordered_list() {
        let html = "<ol><li>first<ol><li>nested</li></ol></li><li>second</li></ol>";
        let md = html_to_markdown(html);
        assert!(md.contains("1. first"));
        assert!(md.contains(" 1. nested"));
        assert!(md.contains("2. second"));
    }

    #[test]
    fn test_mixed_nested_lists() {
        let html = "<ul><li>bullet<ol><li>num 1</li><li>num 2</li></ol></li></ul>";
        let md = html_to_markdown(html);
        assert!(md.contains("- bullet"));
        assert!(md.contains(" 1. num 1"));
        assert!(md.contains(" 2. num 2"));
    }

    #[test]
    fn test_text_after_nested_list() {
        let html = "<ul><li>before<ul><li>nested</li></ul>after</li></ul>";
        let md = html_to_markdown(html);
        // Trailing text inside the item must appear once, not be duplicated.
        assert_eq!(md.matches("after").count(), 1);
    }

    // ----- Tables -----------------------------------------------------------

    #[test]
    fn test_simple_table() {
        let html = "<table><tr><th>A</th><th>B</th></tr><tr><td>1</td><td>2</td></tr></table>";
        let md = html_to_markdown(html);
        assert!(md.contains("| A"));
        assert!(md.contains("| B"));
        assert!(md.contains("---"));
        assert!(md.contains("| 1"));
        assert!(md.contains("| 2"));
    }

    #[test]
    fn test_table_with_pipes_in_content() {
        let html = "<table><tr><th>Cmd</th></tr><tr><td>a | b</td></tr></table>";
        let md = html_to_markdown(html);
        assert!(md.contains(r"a \| b"));
    }

    #[test]
    fn test_table_header_with_pipes() {
        let html = "<table><thead><tr><th>A | B</th></tr></thead><tr><td>1</td></tr></table>";
        let md = html_to_markdown(html);
        assert!(md.contains(r"A \| B"));
    }

    #[test]
    fn test_empty_table() {
        let html = "<table></table>";
        let md = html_to_markdown(html);
        assert!(!md.contains("|"));
    }

    // ----- Code block language detection ------------------------------------

    #[test]
    fn test_language_prism() {
        let html = r#"<pre><code class="language-rust">fn main() {}</code></pre>"#;
        let md = html_to_markdown(html);
        assert!(md.contains("```rust"));
    }

    #[test]
    fn test_language_highlight() {
        let html = r#"<pre><code class="highlight-python">print()</code></pre>"#;
        let md = html_to_markdown(html);
        assert!(md.contains("```python"));
    }

    #[test]
    fn test_language_pandoc() {
        let html = r#"<pre><code class="sourceCode javascript">const x = 1;</code></pre>"#;
        let md = html_to_markdown(html);
        assert!(md.contains("```javascript"));
    }

    #[test]
    fn test_language_direct_class() {
        let html = r#"<pre><code class="rust">fn main() {}</code></pre>"#;
        let md = html_to_markdown(html);
        assert!(md.contains("```rust"));
    }

    // ----- Edge cases -------------------------------------------------------

    #[test]
    fn test_empty_input() {
        let md = html_to_markdown("");
        assert_eq!(md, "");
    }

    #[test]
    fn test_whitespace_only_input() {
        let md = html_to_markdown(" \n\t ");
        assert_eq!(md, "");
    }

    #[test]
    fn test_backticks_in_inline_code() {
        let html = "<code>x `y` z</code>";
        let md = html_to_markdown(html);
        // Content containing a backtick needs a longer fence than one backtick.
        assert!(md.contains("``"));
        assert!(md.contains("x `y` z"));
    }

    #[test]
    fn test_multiple_backticks_in_inline_code() {
        let html = "<code>a ``b`` c</code>";
        let md = html_to_markdown(html);
        // Content containing a double backtick needs at least a triple fence.
        assert!(md.contains("```"));
    }

    #[test]
    fn test_parentheses_in_url() {
        let html = r#"<a href="https://en.wikipedia.org/wiki/Foo_(bar)">Link</a>"#;
        let md = html_to_markdown(html);
        assert!(md.contains("%28"));
        assert!(md.contains("%29"));
        assert!(!md.contains("Foo_(bar)"));
    }

    #[test]
    fn test_empty_link_no_href() {
        let html = "<a>just text</a>";
        let md = html_to_markdown(html);
        assert_eq!(md, "just text\n");
        assert!(!md.contains("["));
    }

    #[test]
    fn test_empty_image_no_src() {
        let html = "<p>before<img>after</p>";
        let md = html_to_markdown(html);
        assert!(!md.contains("!["));
        assert!(md.contains("beforeafter"));
    }

    #[test]
    fn test_empty_emphasis() {
        let html = "<p>a<strong></strong>b<em></em>c</p>";
        let md = html_to_markdown(html);
        assert!(!md.contains("****"));
        assert!(!md.contains("**"));
        assert_eq!(md, "abc\n");
    }

    #[test]
    fn test_nested_blockquote() {
        let html = "<blockquote><blockquote>nested quote</blockquote></blockquote>";
        let md = html_to_markdown(html);
        assert!(md.contains("> > nested quote"));
    }

    #[test]
    fn test_blockquote_with_formatting() {
        let html = "<blockquote><p><strong>bold</strong> text</p></blockquote>";
        let md = html_to_markdown(html);
        assert!(md.contains("> **bold** text"));
    }

    #[test]
    fn test_max_heading_level() {
        let options = MarkdownOptions::new().max_heading_level(2);
        let html = "<h1>H1</h1><h2>H2</h2><h3>H3</h3>";
        let md = html_to_markdown_with_options(html, &options);
        assert!(md.contains("# H1"));
        assert!(md.contains("## H2"));
        // Headings beyond the limit lose their `###` marker but keep their text.
        assert!(!md.contains("### H3"));
        assert!(md.contains("H3"));
    }

    #[test]
    fn test_unicode_content() {
        let html = "<p>Hello 世界 🌍 مرحبا</p>";
        let md = html_to_markdown(html);
        assert!(md.contains("世界"));
        assert!(md.contains("🌍"));
        assert!(md.contains("مرحبا"));
    }

    #[test]
    fn test_html_entities() {
        // The input must use real character references: a raw `"` inside the
        // literal would terminate the Rust string early.
        let html = "<p>&amp; &lt; &gt; &quot;</p>";
        let md = html_to_markdown(html);
        assert!(md.contains("&"));
        assert!(md.contains("<"));
        assert!(md.contains(">"));
    }

    #[test]
    fn test_all_options_disabled() {
        let options = MarkdownOptions::new()
            .preserve_headings(false)
            .include_links(false)
            .include_images(false)
            .preserve_emphasis(false)
            .preserve_strikethrough(false)
            .preserve_lists(false)
            .preserve_code(false)
            .preserve_blockquotes(false)
            .preserve_tables(false);

        let html = "<h1>Title</h1><p><strong>bold</strong></p><a href='#'>link</a>";
        let md = html_to_markdown_with_options(html, &options);

        // No Markdown syntax should remain, only the plain text.
        assert!(!md.contains("#"));
        assert!(!md.contains("**"));
        assert!(!md.contains("["));
        assert!(md.contains("Title"));
        assert!(md.contains("bold"));
        assert!(md.contains("link"));
    }

    // ----- Options builder --------------------------------------------------

    #[test]
    fn test_options_builder() {
        let options = MarkdownOptions::new()
            .preserve_headings(false)
            .include_links(false)
            .max_heading_level(3);

        assert!(!options.preserve_headings);
        assert!(!options.include_links);
        assert_eq!(options.max_heading_level, 3);
    }

    #[test]
    fn test_max_heading_level_clamping() {
        let options = MarkdownOptions::new().max_heading_level(10);
        assert_eq!(options.max_heading_level, 6);

        let options = MarkdownOptions::new().max_heading_level(0);
        assert_eq!(options.max_heading_level, 1);
    }

    // ----- URL resolution ---------------------------------------------------

    #[test]
    fn test_url_resolution_relative_path() {
        let options = MarkdownOptions::new().base_url("https://example.com/docs/page.html");
        let html = r#"<a href="images/logo.png">Logo</a>"#;
        let md = html_to_markdown_with_options(html, &options);
        assert!(md.contains("https://example.com/docs/images/logo.png"));
    }

    #[test]
    fn test_url_resolution_absolute_path() {
        let options = MarkdownOptions::new().base_url("https://example.com/docs/page.html");
        let html = r#"<a href="/assets/style.css">Style</a>"#;
        let md = html_to_markdown_with_options(html, &options);
        assert!(md.contains("https://example.com/assets/style.css"));
    }

    #[test]
    fn test_url_resolution_already_absolute() {
        let options = MarkdownOptions::new().base_url("https://example.com");
        let html = r#"<a href="https://other.com/page">Link</a>"#;
        let md = html_to_markdown_with_options(html, &options);
        assert!(md.contains("https://other.com/page"));
    }

    #[test]
    fn test_url_resolution_image() {
        let options = MarkdownOptions::new().base_url("https://example.com/docs/");
        let html = r#"<img src="img/photo.jpg" alt="Photo">"#;
        let md = html_to_markdown_with_options(html, &options);
        assert!(md.contains("https://example.com/docs/img/photo.jpg"));
    }

    // ----- CommonMark vs. default mode --------------------------------------

    #[test]
    fn test_commonmark_mode_list_indentation() {
        let options = MarkdownOptions::commonmark();
        let html = "<ul><li>parent<ul><li>child</li></ul></li></ul>";
        let md = html_to_markdown_with_options(html, &options);
        // NOTE(review): the expected indent here is a single space, which looks
        // whitespace-collapsed — confirm the intended CommonMark indent width.
        assert!(md.contains(" - child"));
    }

    #[test]
    fn test_commonmark_mode_no_strikethrough() {
        let options = MarkdownOptions::commonmark();
        let html = "<p><del>deleted</del></p>";
        let md = html_to_markdown_with_options(html, &options);
        // The `~~` markers are dropped but the text is kept.
        assert!(!md.contains("~~"));
        assert!(md.contains("deleted"));
    }

    #[test]
    fn test_commonmark_mode_no_tables() {
        let options = MarkdownOptions::commonmark();
        let html = "<table><tr><th>A</th></tr><tr><td>1</td></tr></table>";
        let md = html_to_markdown_with_options(html, &options);
        // No pipe-table syntax, but the cell text is kept.
        assert!(!md.contains("|"));
        assert!(md.contains("A"));
        assert!(md.contains("1"));
    }

    #[test]
    fn test_gfm_mode_strikethrough() {
        let options = MarkdownOptions::new();
        let html = "<p><del>deleted</del></p>";
        let md = html_to_markdown_with_options(html, &options);
        assert!(md.contains("~~deleted~~"));
    }

    // ----- Escaping of special characters -----------------------------------

    #[test]
    fn test_escape_special_chars_enabled() {
        let options = MarkdownOptions::new().escape_special_chars(true);
        let html = "<p>Hello *world* and _underscores_</p>";
        let md = html_to_markdown_with_options(html, &options);
        assert!(md.contains(r"\*world\*"));
        assert!(md.contains(r"\_underscores\_"));
    }

    #[test]
    fn test_escape_special_chars_disabled() {
        let options = MarkdownOptions::new().escape_special_chars(false);
        let html = "<p>Hello *world*</p>";
        let md = html_to_markdown_with_options(html, &options);
        assert!(md.contains("*world*"));
        assert!(!md.contains(r"\*"));
    }

    #[test]
    fn test_commonmark_escapes_by_default() {
        let options = MarkdownOptions::commonmark();
        let html = "<p>Use [brackets] carefully</p>";
        let md = html_to_markdown_with_options(html, &options);
        assert!(md.contains(r"\[brackets\]"));
    }

    // ----- Images with dimensions -------------------------------------------

    #[test]
    fn test_image_with_dimensions_commonmark() {
        let options = MarkdownOptions::commonmark();
        let html = r#"<img src="photo.jpg" alt="Photo" width="200" height="100">"#;
        let md = html_to_markdown_with_options(html, &options);
        // Dimensions cannot be expressed in Markdown image syntax, so the
        // output is expected to contain a raw `<img>` tag carrying them.
        assert!(md.contains("<img"));
        assert!(md.contains("width=\"200\""));
        assert!(md.contains("height=\"100\""));
    }

    #[test]
    fn test_image_without_dimensions_commonmark() {
        let options = MarkdownOptions::commonmark();
        let html = r#"<img src="photo.jpg" alt="Photo">"#;
        let md = html_to_markdown_with_options(html, &options);
        // An empty pattern matches every string, so the assertion must name the
        // expected Markdown image explicitly.
        // NOTE(review): exact rendering inferred from the alt/src above — confirm.
        assert!(md.contains("![Photo](photo.jpg)"));
    }

    #[test]
    fn test_image_with_dimensions_gfm() {
        let options = MarkdownOptions::new();
        let html = r#"<img src="photo.jpg" alt="Photo" width="200">"#;
        let md = html_to_markdown_with_options(html, &options);
        // An empty pattern matches every string, so the assertion must name the
        // expected Markdown image explicitly.
        // NOTE(review): exact rendering inferred from the alt/src above — confirm.
        assert!(md.contains("![Photo](photo.jpg)"));
        assert!(!md.contains("<img"));
    }
}