Skip to main content

html_to_markdown_rs/
lib.rs

1#![allow(
2    clippy::too_many_lines,
3    clippy::option_if_let_else,
4    clippy::match_wildcard_for_single_variants,
5    clippy::needless_pass_by_value,
6    clippy::struct_excessive_bools,
7    clippy::fn_params_excessive_bools,
8    clippy::branches_sharing_code,
9    clippy::match_same_arms,
10    clippy::missing_errors_doc,
11    clippy::items_after_statements,
12    clippy::doc_markdown,
13    clippy::cast_sign_loss,
14    clippy::default_trait_access,
15    clippy::unused_self,
16    clippy::cast_precision_loss,
17    clippy::collapsible_if,
18    clippy::too_many_arguments,
19    clippy::collapsible_else_if,
20    clippy::extra_unused_lifetimes,
21    clippy::unnecessary_lazy_evaluations,
22    clippy::must_use_candidate,
23    clippy::trivially_copy_pass_by_ref,
24    clippy::explicit_iter_loop,
25    clippy::missing_const_for_fn,
26    clippy::manual_assert,
27    clippy::return_self_not_must_use,
28    clippy::collapsible_match,
29    clippy::cast_possible_truncation,
30    clippy::map_unwrap_or,
31    clippy::manual_let_else,
32    clippy::used_underscore_binding,
33    clippy::assigning_clones,
34    clippy::uninlined_format_args
35)]
36
37//! High-performance HTML to Markdown converter.
38//!
39//! Built with html5ever for fast, memory-efficient HTML parsing.
40//!
41//! ## Optional inline image extraction
42//!
43//! Enable the `inline-images` Cargo feature to collect embedded data URI images and inline SVG
44//! assets alongside the produced Markdown.
45
46// ============================================================================
47// Module Declarations
48// ============================================================================
49
50pub mod converter;
51pub mod error;
52#[cfg(feature = "inline-images")]
53mod inline_images;
54#[cfg(feature = "metadata")]
55pub mod metadata;
56pub mod options;
57pub mod safety;
58pub mod text;
59pub mod types;
60#[cfg(feature = "visitor")]
61pub mod visitor;
62#[cfg(feature = "visitor")]
63pub mod visitor_helpers;
64pub mod wrapper;
65
// Internal modules (private, not part of the public API) — except `prelude`, which is public
67mod convert_api;
68mod exports;
69pub mod prelude;
70mod rcdom;
71mod validation;
72
73// ============================================================================
74// Public Re-exports (from exports module)
75// ============================================================================
76
77pub use exports::*;
78pub use types::{
79    AnnotationKind, ConversionResult, DocumentNode, DocumentStructure, GridCell, NodeContent, ProcessingWarning,
80    TableData, TableGrid, TextAnnotation, WarningKind,
81};
82
83// ============================================================================
84// Main Public API Functions
85// ============================================================================
86
87pub use convert_api::convert;
88
89#[cfg(any(feature = "serde", feature = "metadata"))]
90pub use convert_api::{conversion_options_from_json, conversion_options_update_from_json};
91
92#[cfg(feature = "metadata")]
93pub use convert_api::metadata_config_from_json;
94
95#[cfg(feature = "inline-images")]
96pub use convert_api::inline_image_config_from_json;
97
98#[cfg(feature = "visitor")]
99#[doc(hidden)]
100pub use convert_api::convert_with_visitor;
101
// ============================================================================
// Tests
// ============================================================================
104
#[cfg(test)]
mod basic_tests {
    use super::*;

    /// Text containing a run of 20 NUL bytes must come back as
    /// `ConversionError::InvalidInput`.
    #[test]
    fn test_binary_input_rejected() {
        let nul_run = "\0".repeat(20);
        let payload = ["abc", nul_run.as_str(), "def"].concat();

        let outcome = convert(&payload, None);

        assert!(matches!(outcome, Err(ConversionError::InvalidInput(_))));
    }

    /// Input that begins with the `%PDF-1.7` header must come back as
    /// `ConversionError::InvalidInput`.
    #[test]
    fn test_binary_magic_rejected() {
        let outcome = convert("%PDF-1.7", None);

        assert!(matches!(outcome, Err(ConversionError::InvalidInput(_))));
    }

    /// Bytes shaped like BOM-prefixed UTF-16LE (`FF FE` followed by
    /// NUL-interleaved ASCII), lossily decoded to a `String`, must still
    /// convert successfully.
    #[test]
    fn test_utf16_hint_recovered() {
        let raw: &[u8] = b"\xFF\xFE<\0h\0t\0m\0l\0>\0";
        let decoded = String::from_utf8_lossy(raw).into_owned();

        let outcome = convert(&decoded, None);

        assert!(outcome.is_ok(), "UTF-16 input should be recovered instead of rejected");
    }

    /// Input without any markup converts, and the text appears in the output.
    #[test]
    fn test_plain_text_allowed() {
        let markdown = convert("Just text", None).unwrap().content.unwrap_or_default();

        assert!(markdown.contains("Just text"));
    }

    /// With both escape flags enabled, literal `*` and `_` in plain text are
    /// backslash-escaped in the output.
    #[test]
    fn test_plain_text_escaped_when_enabled() {
        let escaping = ConversionOptions {
            escape_asterisks: true,
            escape_underscores: true,
            ..Default::default()
        };

        let converted = convert("Text *asterisks* _underscores_", Some(escaping)).unwrap();
        let markdown = converted.content.unwrap_or_default();

        assert!(markdown.contains(r"\*asterisks\*"));
        assert!(markdown.contains(r"\_underscores\_"));
    }
}
149}