// html_to_markdown_rs/options.rs

//! Configuration options for HTML to Markdown conversion.

/// Heading style options.
///
/// The default is [`HeadingStyle::Atx`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum HeadingStyle {
    /// Underlined style (`===` for h1, `---` for h2).
    Underlined,
    /// ATX style (`#` for h1, `##` for h2, etc.).
    #[default]
    Atx,
    /// ATX closed style (`# title #`).
    AtxClosed,
}

/// List indentation type.
///
/// The default is [`ListIndentType::Spaces`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum ListIndentType {
    /// Indent with spaces.
    #[default]
    Spaces,
    /// Indent with tabs.
    Tabs,
}

/// Whitespace handling mode.
///
/// The default is [`WhitespaceMode::Normalized`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum WhitespaceMode {
    /// Normalized whitespace handling.
    // NOTE(review): the exact normalization rules are implemented by the converter,
    // not in this file.
    #[default]
    Normalized,
    /// Strict whitespace handling.
    // NOTE(review): presumably keeps source whitespace as-is; confirm against the converter.
    Strict,
}

/// Newline style.
///
/// The default is [`NewlineStyle::Spaces`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum NewlineStyle {
    /// Two spaces at end of line.
    #[default]
    Spaces,
    /// Backslash at end of line.
    Backslash,
}

/// Code block style.
///
/// The default is [`CodeBlockStyle::Indented`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum CodeBlockStyle {
    /// Indented code blocks (4 spaces) - CommonMark default.
    #[default]
    Indented,
    /// Fenced code blocks delimited by three backticks.
    Backticks,
    /// Fenced code blocks delimited by three tildes (`~~~`).
    Tildes,
}

/// Highlight style for `<mark>` elements.
///
/// The default is [`HighlightStyle::DoubleEqual`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum HighlightStyle {
    /// `==text==`
    #[default]
    DoubleEqual,
    /// `<mark>text</mark>`
    Html,
    /// `**text**`
    Bold,
    /// Plain text (no formatting).
    None,
}

/// Preprocessing preset levels.
///
/// The default is [`PreprocessingPreset::Standard`].
// NOTE(review): what each level actually removes is decided by the preprocessor,
// not in this file; the variant docs below only restate the level names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum PreprocessingPreset {
    /// Minimal preprocessing level.
    Minimal,
    /// Standard preprocessing level.
    #[default]
    Standard,
    /// Aggressive preprocessing level.
    Aggressive,
}

76/// Main conversion options.
77#[derive(Debug, Clone)]
78pub struct ConversionOptions {
79    /// Heading style
80    pub heading_style: HeadingStyle,
81
82    /// List indentation type
83    pub list_indent_type: ListIndentType,
84
85    /// List indentation width (spaces)
86    pub list_indent_width: usize,
87
88    /// Bullet characters for unordered lists
89    pub bullets: String,
90
91    /// Symbol for strong/emphasis (* or _)
92    pub strong_em_symbol: char,
93
94    /// Escape asterisks in text
95    pub escape_asterisks: bool,
96
97    /// Escape underscores in text
98    pub escape_underscores: bool,
99
100    /// Escape misc markdown characters
101    pub escape_misc: bool,
102
103    /// Escape all ASCII punctuation (for CommonMark spec compliance tests)
104    pub escape_ascii: bool,
105
106    /// Default code language
107    pub code_language: String,
108
109    /// Use autolinks for bare URLs
110    pub autolinks: bool,
111
112    /// Add default title if none exists
113    pub default_title: bool,
114
115    /// Use <br> in tables instead of spaces
116    pub br_in_tables: bool,
117
118    /// Enable spatial table reconstruction in hOCR documents
119    pub hocr_spatial_tables: bool,
120
121    /// Highlight style for <mark> elements
122    pub highlight_style: HighlightStyle,
123
124    /// Extract metadata from HTML
125    pub extract_metadata: bool,
126
127    /// Whitespace handling mode
128    pub whitespace_mode: WhitespaceMode,
129
130    /// Strip newlines from HTML before processing
131    pub strip_newlines: bool,
132
133    /// Enable text wrapping
134    pub wrap: bool,
135
136    /// Text wrap width
137    pub wrap_width: usize,
138
139    /// Treat block elements as inline
140    pub convert_as_inline: bool,
141
142    /// Subscript symbol
143    pub sub_symbol: String,
144
145    /// Superscript symbol
146    pub sup_symbol: String,
147
148    /// Newline style
149    pub newline_style: NewlineStyle,
150
151    /// Code block style
152    pub code_block_style: CodeBlockStyle,
153
154    /// Elements where images should remain as markdown (not converted to alt text)
155    pub keep_inline_images_in: Vec<String>,
156
157    /// Preprocessing options
158    pub preprocessing: PreprocessingOptions,
159
160    /// Source encoding (informational)
161    pub encoding: String,
162
163    /// Enable debug mode with diagnostic warnings
164    pub debug: bool,
165
166    /// List of HTML tags to strip (output only text content, no markdown conversion)
167    pub strip_tags: Vec<String>,
168
169    /// List of HTML tags to preserve as-is in the output (keep original HTML)
170    /// Useful for complex elements like tables that don't convert well to Markdown
171    pub preserve_tags: Vec<String>,
172}
173
174impl Default for ConversionOptions {
175    fn default() -> Self {
176        Self {
177            heading_style: HeadingStyle::default(),
178            list_indent_type: ListIndentType::default(),
179            list_indent_width: 2,
180            bullets: "-".to_string(),
181            strong_em_symbol: '*',
182            escape_asterisks: false,
183            escape_underscores: false,
184            escape_misc: false,
185            escape_ascii: false,
186            code_language: String::new(),
187            autolinks: true,
188            default_title: false,
189            br_in_tables: false,
190            hocr_spatial_tables: true,
191            highlight_style: HighlightStyle::default(),
192            extract_metadata: true,
193            whitespace_mode: WhitespaceMode::default(),
194            strip_newlines: false,
195            wrap: false,
196            wrap_width: 80,
197            convert_as_inline: false,
198            sub_symbol: String::new(),
199            sup_symbol: String::new(),
200            newline_style: NewlineStyle::Spaces,
201            code_block_style: CodeBlockStyle::default(),
202            keep_inline_images_in: Vec::new(),
203            preprocessing: PreprocessingOptions::default(),
204            encoding: "utf-8".to_string(),
205            debug: false,
206            strip_tags: Vec::new(),
207            preserve_tags: Vec::new(),
208        }
209    }
210}
211
212/// HTML preprocessing options.
213#[derive(Debug, Clone)]
214pub struct PreprocessingOptions {
215    /// Enable preprocessing
216    pub enabled: bool,
217
218    /// Preprocessing preset
219    pub preset: PreprocessingPreset,
220
221    /// Remove navigation elements
222    pub remove_navigation: bool,
223
224    /// Remove form elements
225    pub remove_forms: bool,
226}
227
228impl Default for PreprocessingOptions {
229    fn default() -> Self {
230        Self {
231            enabled: true,
232            preset: PreprocessingPreset::default(),
233            remove_navigation: true,
234            remove_forms: true,
235        }
236    }
237}