// quick_html2md/lib.rs

//! Fast HTML to Markdown conversion with GFM support.
//!
//! # Features
//!
//! - **Headings**: `<h1>`-`<h6>` -> `#`-`######`
//! - **Emphasis**: `<strong>`/`<b>` -> `**bold**`, `<em>`/`<i>` -> `*italic*`
//! - **Strikethrough**: `<del>`/`<s>` -> `~~struck~~`
//! - **Lists**: `<ul>`/`<ol>` with proper nesting and indentation
//! - **Links**: `<a href="">` -> `[text](url)`
//! - **Images**: `<img>` -> `![alt](src)`
//! - **Code**: `<code>` -> `` `inline` ``, `<pre><code>` -> fenced blocks with language
//! - **Tables**: Full GFM table support with alignment
//! - **Blockquotes**: `<blockquote>` -> `> quote`
//!
//! # Quick Start
//!
//! ```
//! use quick_html2md::html_to_markdown;
//!
//! let html = "<h1>Hello</h1><p>World</p>";
//! let md = html_to_markdown(html);
//! assert!(md.contains("# Hello"));
//! assert!(md.contains("World"));
//! ```
//!
//! # With Options
//!
//! ```
//! use quick_html2md::{html_to_markdown_with_options, MarkdownOptions};
//!
//! let options = MarkdownOptions::new()
//!     .include_links(false)
//!     .preserve_tables(true);
//!
//! let html = "<p><a href='url'>link</a></p>";
//! let md = html_to_markdown_with_options(html, &options);
//! ```
38
39mod converter;
40mod elements;
41pub mod options;
42mod utils;
43
44pub use options::MarkdownOptions;
45
46use dom_query::{Document, Selection};
47
48use converter::convert_node;
49use utils::normalize_output;
50
51/// Convert HTML document to Markdown.
52///
53/// Uses default options.
54///
55/// # Example
56///
57/// ```
58/// use quick_html2md::to_markdown;
59/// use dom_query::Document;
60///
61/// let doc = Document::from("<h1>Title</h1><p>Content</p>");
62/// let md = to_markdown(&doc);
63/// assert!(md.contains("# Title"));
64/// ```
65#[must_use]
66pub fn to_markdown(doc: &Document) -> String {
67    to_markdown_with_options(doc, &MarkdownOptions::new())
68}
69
70/// Convert HTML document to Markdown with options.
71#[must_use]
72pub fn to_markdown_with_options(doc: &Document, options: &MarkdownOptions) -> String {
73    let mut output = String::new();
74    let body = doc.select("body");
75
76    if body.exists() {
77        for node in body.children().nodes() {
78            let sel = Selection::from(*node);
79            convert_node(&sel, &mut output, options, 0);
80        }
81    } else {
82        // No body, process root
83        for node in doc.select("*").first().children().nodes() {
84            let sel = Selection::from(*node);
85            convert_node(&sel, &mut output, options, 0);
86        }
87    }
88
89    normalize_output(&output)
90}
91
92/// Convert HTML element to Markdown.
93#[must_use]
94pub fn element_to_markdown(element: &Selection) -> String {
95    element_to_markdown_with_options(element, &MarkdownOptions::new())
96}
97
98/// Convert HTML element to Markdown with options.
99#[must_use]
100pub fn element_to_markdown_with_options(element: &Selection, options: &MarkdownOptions) -> String {
101    let mut output = String::new();
102    convert_node(element, &mut output, options, 0);
103    normalize_output(&output)
104}
105
106/// Convert HTML string to Markdown.
107///
108/// # Example
109///
110/// ```
111/// use quick_html2md::html_to_markdown;
112///
113/// let md = html_to_markdown("<h1>Title</h1><p>Content</p>");
114/// assert!(md.contains("# Title"));
115/// assert!(md.contains("Content"));
116/// ```
117#[must_use]
118pub fn html_to_markdown(html: &str) -> String {
119    let doc = Document::from(html);
120    to_markdown(&doc)
121}
122
123/// Convert HTML string to Markdown with options.
124#[must_use]
125pub fn html_to_markdown_with_options(html: &str, options: &MarkdownOptions) -> String {
126    let doc = Document::from(html);
127    to_markdown_with_options(&doc, options)
128}
129
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    // ---------- Basic elements ----------

    /// Two consecutive headings are rendered with a blank line between them.
    #[test]
    fn test_heading_conversion() {
        let out = html_to_markdown("<h1>Title</h1><h2>Subtitle</h2>");
        assert_eq!(out, "# Title\n\n## Subtitle\n");
    }

    /// Every heading level from `h1` to `h6` gets the matching number of `#`.
    #[test]
    fn test_all_heading_levels() {
        let out =
            html_to_markdown("<h1>H1</h1><h2>H2</h2><h3>H3</h3><h4>H4</h4><h5>H5</h5><h6>H6</h6>");
        for expected in &["# H1", "## H2", "### H3", "#### H4", "##### H5", "###### H6"] {
            assert!(out.contains(*expected));
        }
    }

    /// A lone paragraph becomes its text followed by a newline.
    #[test]
    fn test_paragraph() {
        assert_eq!(html_to_markdown("<p>Hello world</p>"), "Hello world\n");
    }

    /// `<strong>` maps to `**…**` and `<em>` to `*…*`.
    #[test]
    fn test_bold_and_italic() {
        let out = html_to_markdown("<p><strong>bold</strong> and <em>italic</em></p>");
        assert_eq!(out, "**bold** and *italic*\n");
    }

    /// Both `<del>` and `<s>` map to `~~…~~`.
    #[test]
    fn test_strikethrough() {
        let out = html_to_markdown("<p><del>deleted</del> and <s>struck</s></p>");
        assert_eq!(out, "~~deleted~~ and ~~struck~~\n");
    }

    /// An anchor with an `href` becomes an inline Markdown link.
    #[test]
    fn test_links() {
        let out = html_to_markdown(r#"<a href="https://example.com">Link</a>"#);
        assert_eq!(out, "[Link](https://example.com)\n");
    }

    /// Unordered list items are prefixed with `- `.
    #[test]
    fn test_unordered_list() {
        assert_eq!(
            html_to_markdown("<ul><li>One</li><li>Two</li></ul>"),
            "- One\n- Two\n"
        );
    }

    /// Ordered list items are numbered starting from 1.
    #[test]
    fn test_ordered_list() {
        assert_eq!(
            html_to_markdown("<ol><li>First</li><li>Second</li></ol>"),
            "1. First\n2. Second\n"
        );
    }

    /// `<pre><code>` produces a fenced block that keeps the code text.
    #[test]
    fn test_code_block() {
        let out = html_to_markdown("<pre><code>fn main() {}</code></pre>");
        assert!(out.contains("```"));
        assert!(out.contains("fn main()"));
    }

    /// A blockquote is rendered with a `> ` prefix.
    #[test]
    fn test_blockquote() {
        let out = html_to_markdown("<blockquote>Quote text</blockquote>");
        assert_eq!(out, "> Quote text\n");
    }

    // ---------- Nested lists ----------

    /// A nested bullet list is indented by two spaces.
    #[test]
    fn test_nested_unordered_list() {
        let out = html_to_markdown("<ul><li>parent<ul><li>child</li></ul></li></ul>");
        assert_eq!(out, "- parent\n  - child\n");
    }

    /// Each further nesting level adds two more spaces of indentation.
    #[test]
    fn test_deeply_nested_list() {
        let out =
            html_to_markdown("<ul><li>L1<ul><li>L2<ul><li>L3</li></ul></li></ul></li></ul>");
        assert_eq!(out, "- L1\n  - L2\n    - L3\n");
    }

    /// Sibling items that each own a nested list are emitted without duplicates.
    #[test]
    fn test_siblings_with_nested_children() {
        let out = html_to_markdown(
            "<ul><li>A<ul><li>A1</li></ul></li><li>B<ul><li>B1</li></ul></li></ul>",
        );
        // Every nested item must show up exactly one time.
        assert_eq!(out.matches("A1").count(), 1);
        assert_eq!(out.matches("B1").count(), 1);
        assert_eq!(out, "- A\n  - A1\n- B\n  - B1\n");
    }

    /// Nested ordered lists restart numbering and keep the outer sequence.
    #[test]
    fn test_nested_ordered_list() {
        let out =
            html_to_markdown("<ol><li>first<ol><li>nested</li></ol></li><li>second</li></ol>");
        assert!(out.contains("1. first"));
        assert!(out.contains("  1. nested"));
        assert!(out.contains("2. second"));
    }

    /// An ordered list nested inside a bullet list is numbered and indented.
    #[test]
    fn test_mixed_nested_lists() {
        let out =
            html_to_markdown("<ul><li>bullet<ol><li>num 1</li><li>num 2</li></ol></li></ul>");
        assert!(out.contains("- bullet"));
        assert!(out.contains("  1. num 1"));
        assert!(out.contains("  2. num 2"));
    }

    /// Text following a nested list inside the same item is emitted once.
    #[test]
    fn test_text_after_nested_list() {
        let out = html_to_markdown("<ul><li>before<ul><li>nested</li></ul>after</li></ul>");
        // The trailing text must not be duplicated.
        assert_eq!(out.matches("after").count(), 1);
    }

    // ---------- Tables ----------

    /// A header row plus a data row yields cells and a separator line.
    #[test]
    fn test_simple_table() {
        let out = html_to_markdown(
            "<table><tr><th>A</th><th>B</th></tr><tr><td>1</td><td>2</td></tr></table>",
        );
        for expected in &["| A", "| B", "---", "| 1", "| 2"] {
            assert!(out.contains(*expected));
        }
    }

    /// A literal pipe inside a data cell is backslash-escaped.
    #[test]
    fn test_table_with_pipes_in_content() {
        let out =
            html_to_markdown("<table><tr><th>Cmd</th></tr><tr><td>a | b</td></tr></table>");
        assert!(out.contains(r"a \| b"));
    }

    /// A literal pipe inside a header cell is backslash-escaped.
    #[test]
    fn test_table_header_with_pipes() {
        let out = html_to_markdown(
            "<table><thead><tr><th>A | B</th></tr></thead><tr><td>1</td></tr></table>",
        );
        assert!(out.contains(r"A \| B"));
    }

    /// An empty table emits no pipe characters at all.
    #[test]
    fn test_empty_table() {
        let out = html_to_markdown("<table></table>");
        assert!(!out.contains("|"));
    }

    // ---------- Code language detection ----------

    /// `language-xxx` classes select the fence language.
    #[test]
    fn test_language_prism() {
        let out =
            html_to_markdown(r#"<pre><code class="language-rust">fn main() {}</code></pre>"#);
        assert!(out.contains("```rust"));
    }

    /// `highlight-xxx` classes select the fence language.
    #[test]
    fn test_language_highlight() {
        let out =
            html_to_markdown(r#"<pre><code class="highlight-python">print()</code></pre>"#);
        assert!(out.contains("```python"));
    }

    /// `sourceCode xxx` class pairs select the fence language.
    #[test]
    fn test_language_pandoc() {
        let out = html_to_markdown(
            r#"<pre><code class="sourceCode javascript">const x = 1;</code></pre>"#,
        );
        assert!(out.contains("```javascript"));
    }

    /// A bare language name used as the class selects the fence language.
    #[test]
    fn test_language_direct_class() {
        let out = html_to_markdown(r#"<pre><code class="rust">fn main() {}</code></pre>"#);
        assert!(out.contains("```rust"));
    }

    // ---------- Edge cases ----------

    /// Empty input converts to an empty string.
    #[test]
    fn test_empty_input() {
        assert_eq!(html_to_markdown(""), "");
    }

    /// Whitespace-only input converts to an empty string.
    #[test]
    fn test_whitespace_only_input() {
        assert_eq!(html_to_markdown("   \n\t  "), "");
    }

    /// Inline code containing a backtick is wrapped in a longer delimiter.
    #[test]
    fn test_backticks_in_inline_code() {
        let out = html_to_markdown("<code>x `y` z</code>");
        // A double-backtick delimiter is expected around the original text.
        assert!(out.contains("``"));
        assert!(out.contains("x `y` z"));
    }

    /// Inline code containing a double backtick needs a triple delimiter.
    #[test]
    fn test_multiple_backticks_in_inline_code() {
        let out = html_to_markdown("<code>a ``b`` c</code>");
        // The delimiter has to be three backticks long.
        assert!(out.contains("```"));
    }

    /// Parentheses inside a link target are percent-encoded.
    #[test]
    fn test_parentheses_in_url() {
        let out =
            html_to_markdown(r#"<a href="https://en.wikipedia.org/wiki/Foo_(bar)">Link</a>"#);
        assert!(out.contains("%28"));
        assert!(out.contains("%29"));
        assert!(!out.contains("Foo_(bar)"));
    }

    /// An anchor without `href` is rendered as plain text.
    #[test]
    fn test_empty_link_no_href() {
        let out = html_to_markdown("<a>just text</a>");
        assert_eq!(out, "just text\n");
        assert!(!out.contains("["));
    }

    /// An image without `src` produces no image markup.
    #[test]
    fn test_empty_image_no_src() {
        let out = html_to_markdown("<p>before<img>after</p>");
        // No `![` marker may appear; the surrounding text stays adjacent.
        assert!(!out.contains("!["));
        assert!(out.contains("beforeafter"));
    }

    /// Empty `<strong>` / `<em>` elements leave no markers behind.
    #[test]
    fn test_empty_emphasis() {
        let out = html_to_markdown("<p>a<strong></strong>b<em></em>c</p>");
        // Neither a doubled nor a single bold marker may remain.
        assert!(!out.contains("****"));
        assert!(!out.contains("**"));
        assert_eq!(out, "abc\n");
    }

    /// A blockquote inside a blockquote gets a doubled prefix.
    #[test]
    fn test_nested_blockquote() {
        let out =
            html_to_markdown("<blockquote><blockquote>nested quote</blockquote></blockquote>");
        assert!(out.contains("> > nested quote"));
    }

    /// Inline formatting is preserved inside a blockquote.
    #[test]
    fn test_blockquote_with_formatting() {
        let out =
            html_to_markdown("<blockquote><p><strong>bold</strong> text</p></blockquote>");
        assert!(out.contains("> **bold** text"));
    }

    /// Headings deeper than `max_heading_level` lose their `#` prefix.
    #[test]
    fn test_max_heading_level() {
        let opts = MarkdownOptions::new().max_heading_level(2);
        let out = html_to_markdown_with_options("<h1>H1</h1><h2>H2</h2><h3>H3</h3>", &opts);
        assert!(out.contains("# H1"));
        assert!(out.contains("## H2"));
        // The h3 text survives, but without a `###` marker.
        assert!(!out.contains("### H3"));
        assert!(out.contains("H3"));
    }

    /// Non-ASCII text passes through unchanged.
    #[test]
    fn test_unicode_content() {
        let out = html_to_markdown("<p>Hello 世界 🌍 مرحبا</p>");
        assert!(out.contains("世界"));
        assert!(out.contains("🌍"));
        assert!(out.contains("مرحبا"));
    }

    /// Character entities are decoded in the output.
    #[test]
    fn test_html_entities() {
        let out = html_to_markdown("<p>&amp; &lt; &gt; &quot;</p>");
        assert!(out.contains("&"));
        assert!(out.contains("<"));
        assert!(out.contains(">"));
    }

    /// With every feature switched off only the plain text remains.
    #[test]
    fn test_all_options_disabled() {
        let opts = MarkdownOptions::new()
            .preserve_headings(false)
            .include_links(false)
            .include_images(false)
            .preserve_emphasis(false)
            .preserve_strikethrough(false)
            .preserve_lists(false)
            .preserve_code(false)
            .preserve_blockquotes(false)
            .preserve_tables(false);

        let out = html_to_markdown_with_options(
            "<h1>Title</h1><p><strong>bold</strong></p><a href='#'>link</a>",
            &opts,
        );

        // No Markdown markers may appear...
        for marker in &["#", "**", "["] {
            assert!(!out.contains(*marker));
        }
        // ...while the text content is still present.
        for text in &["Title", "bold", "link"] {
            assert!(out.contains(*text));
        }
    }

    // ---------- Builder pattern ----------

    /// Builder setters are reflected in the resulting option fields.
    #[test]
    fn test_options_builder() {
        let opts = MarkdownOptions::new()
            .preserve_headings(false)
            .include_links(false)
            .max_heading_level(3);

        assert!(!opts.preserve_headings);
        assert!(!opts.include_links);
        assert_eq!(opts.max_heading_level, 3);
    }

    /// `max_heading_level` is clamped into the range 1..=6.
    #[test]
    fn test_max_heading_level_clamping() {
        let too_high = MarkdownOptions::new().max_heading_level(10);
        assert_eq!(too_high.max_heading_level, 6);

        let too_low = MarkdownOptions::new().max_heading_level(0);
        assert_eq!(too_low.max_heading_level, 1);
    }

    // ---------- URL resolution ----------

    /// A relative link is resolved against the directory of the base URL.
    #[test]
    fn test_url_resolution_relative_path() {
        let opts = MarkdownOptions::new().base_url("https://example.com/docs/page.html");
        let out = html_to_markdown_with_options(r#"<a href="images/logo.png">Logo</a>"#, &opts);
        assert!(out.contains("https://example.com/docs/images/logo.png"));
    }

    /// A root-relative link is resolved against the origin of the base URL.
    #[test]
    fn test_url_resolution_absolute_path() {
        let opts = MarkdownOptions::new().base_url("https://example.com/docs/page.html");
        let out = html_to_markdown_with_options(r#"<a href="/assets/style.css">Style</a>"#, &opts);
        assert!(out.contains("https://example.com/assets/style.css"));
    }

    /// An already absolute link is kept as-is.
    #[test]
    fn test_url_resolution_already_absolute() {
        let opts = MarkdownOptions::new().base_url("https://example.com");
        let out =
            html_to_markdown_with_options(r#"<a href="https://other.com/page">Link</a>"#, &opts);
        assert!(out.contains("https://other.com/page"));
    }

    /// Image sources are resolved against the base URL as well.
    #[test]
    fn test_url_resolution_image() {
        let opts = MarkdownOptions::new().base_url("https://example.com/docs/");
        let out =
            html_to_markdown_with_options(r#"<img src="img/photo.jpg" alt="Photo">"#, &opts);
        assert!(out.contains("https://example.com/docs/img/photo.jpg"));
    }

    // ---------- CommonMark mode ----------

    /// CommonMark mode indents nested list items by four spaces.
    #[test]
    fn test_commonmark_mode_list_indentation() {
        let opts = MarkdownOptions::commonmark();
        let out = html_to_markdown_with_options(
            "<ul><li>parent<ul><li>child</li></ul></li></ul>",
            &opts,
        );
        // Four spaces precede the nested bullet.
        assert!(out.contains("    - child"));
    }

    /// CommonMark mode drops strikethrough markers but keeps the text.
    #[test]
    fn test_commonmark_mode_no_strikethrough() {
        let opts = MarkdownOptions::commonmark();
        let out = html_to_markdown_with_options("<p><del>deleted</del></p>", &opts);
        // `~~` is a GFM extension and must not be emitted here.
        assert!(!out.contains("~~"));
        assert!(out.contains("deleted"));
    }

    /// CommonMark mode drops table syntax but keeps the cell text.
    #[test]
    fn test_commonmark_mode_no_tables() {
        let opts = MarkdownOptions::commonmark();
        let out = html_to_markdown_with_options(
            "<table><tr><th>A</th></tr><tr><td>1</td></tr></table>",
            &opts,
        );
        // Pipe tables are a GFM extension and must not be emitted here.
        assert!(!out.contains("|"));
        assert!(out.contains("A"));
        assert!(out.contains("1"));
    }

    /// The default (GFM) options keep strikethrough markers.
    #[test]
    fn test_gfm_mode_strikethrough() {
        // `MarkdownOptions::new()` is the GFM configuration.
        let opts = MarkdownOptions::new();
        let out = html_to_markdown_with_options("<p><del>deleted</del></p>", &opts);
        assert!(out.contains("~~deleted~~"));
    }

    // ---------- Markdown escaping ----------

    /// With escaping on, `*` and `_` in text are backslash-escaped.
    #[test]
    fn test_escape_special_chars_enabled() {
        let opts = MarkdownOptions::new().escape_special_chars(true);
        let out =
            html_to_markdown_with_options("<p>Hello *world* and _underscores_</p>", &opts);
        assert!(out.contains(r"\*world\*"));
        assert!(out.contains(r"\_underscores\_"));
    }

    /// With escaping off, special characters are emitted verbatim.
    #[test]
    fn test_escape_special_chars_disabled() {
        let opts = MarkdownOptions::new().escape_special_chars(false);
        let out = html_to_markdown_with_options("<p>Hello *world*</p>", &opts);
        // The asterisks stay unescaped.
        assert!(out.contains("*world*"));
        assert!(!out.contains(r"\*"));
    }

    /// The CommonMark preset escapes special characters without being asked.
    #[test]
    fn test_commonmark_escapes_by_default() {
        let opts = MarkdownOptions::commonmark();
        let out = html_to_markdown_with_options("<p>Use [brackets] carefully</p>", &opts);
        assert!(out.contains(r"\[brackets\]"));
    }

    // ---------- Image dimension handling ----------

    /// CommonMark mode falls back to an HTML `<img>` when dimensions are set.
    #[test]
    fn test_image_with_dimensions_commonmark() {
        let opts = MarkdownOptions::commonmark();
        let out = html_to_markdown_with_options(
            r#"<img src="photo.jpg" alt="Photo" width="200" height="100">"#,
            &opts,
        );
        // The raw tag and both dimension attributes must be present.
        assert!(out.contains("<img"));
        assert!(out.contains("width=\"200\""));
        assert!(out.contains("height=\"100\""));
    }

    /// CommonMark mode uses Markdown image syntax when no dimensions are set.
    #[test]
    fn test_image_without_dimensions_commonmark() {
        let opts = MarkdownOptions::commonmark();
        let out = html_to_markdown_with_options(r#"<img src="photo.jpg" alt="Photo">"#, &opts);
        // Plain Markdown image syntax is expected.
        assert!(out.contains("![Photo](photo.jpg)"));
    }

    /// GFM mode ignores dimensions and always uses Markdown image syntax.
    #[test]
    fn test_image_with_dimensions_gfm() {
        // `MarkdownOptions::new()` is the GFM configuration.
        let opts = MarkdownOptions::new();
        let out = html_to_markdown_with_options(
            r#"<img src="photo.jpg" alt="Photo" width="200">"#,
            &opts,
        );
        // The width attribute is dropped and no raw HTML is emitted.
        assert!(out.contains("![Photo](photo.jpg)"));
        assert!(!out.contains("<img"));
    }
}