html_to_markdown_rs/
options.rs

1//! Configuration options for HTML to Markdown conversion.
2
/// Heading style options.
///
/// The default is [`HeadingStyle::Atx`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum HeadingStyle {
    /// Underlined style (`===` for h1, `---` for h2).
    Underlined,
    /// ATX style (`#` for h1, `##` for h2, etc.). This is the default.
    #[default]
    Atx,
    /// ATX closed style (`# title #`).
    AtxClosed,
}
19
/// List indentation type.
///
/// The default is [`ListIndentType::Spaces`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum ListIndentType {
    /// Indent with spaces. This is the default.
    #[default]
    Spaces,
    /// Indent with tabs.
    Tabs,
}
32
/// Whitespace handling mode.
///
/// The default is [`WhitespaceMode::Normalized`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum WhitespaceMode {
    /// Normalized whitespace handling. This is the default.
    #[default]
    Normalized,
    /// Strict whitespace handling.
    Strict,
}
45
/// Newline style.
///
/// The default is [`NewlineStyle::Spaces`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum NewlineStyle {
    /// Two spaces at end of line. This is the default.
    #[default]
    Spaces,
    /// Backslash at end of line.
    Backslash,
}
60
/// Code block style.
///
/// The default is [`CodeBlockStyle::Indented`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum CodeBlockStyle {
    /// Indented code blocks (4 spaces) - CommonMark default. This is the default.
    #[default]
    Indented,
    /// Fenced code blocks delimited by three backticks.
    Backticks,
    /// Fenced code blocks delimited by three tildes (`~~~`).
    Tildes,
}
77
/// Highlight style for `<mark>` elements.
///
/// The default is [`HighlightStyle::DoubleEqual`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum HighlightStyle {
    /// `==text==`. This is the default.
    #[default]
    DoubleEqual,
    /// `<mark>text</mark>`
    Html,
    /// `**text**`
    Bold,
    /// Plain text (no formatting).
    None,
}
96
/// Preprocessing preset levels.
///
/// The default is [`PreprocessingPreset::Standard`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum PreprocessingPreset {
    /// Minimal preset.
    Minimal,
    /// Standard preset. This is the default.
    #[default]
    Standard,
    /// Aggressive preset.
    Aggressive,
}
110
111/// Main conversion options.
112#[derive(Debug, Clone)]
113pub struct ConversionOptions {
114    /// Heading style
115    pub heading_style: HeadingStyle,
116
117    /// List indentation type
118    pub list_indent_type: ListIndentType,
119
120    /// List indentation width (spaces)
121    pub list_indent_width: usize,
122
123    /// Bullet characters for unordered lists
124    pub bullets: String,
125
126    /// Symbol for strong/emphasis (* or _)
127    pub strong_em_symbol: char,
128
129    /// Escape asterisks in text
130    pub escape_asterisks: bool,
131
132    /// Escape underscores in text
133    pub escape_underscores: bool,
134
135    /// Escape misc markdown characters
136    pub escape_misc: bool,
137
138    /// Escape all ASCII punctuation (for CommonMark spec compliance tests)
139    pub escape_ascii: bool,
140
141    /// Default code language
142    pub code_language: String,
143
144    /// Use autolinks for bare URLs
145    pub autolinks: bool,
146
147    /// Add default title if none exists
148    pub default_title: bool,
149
150    /// Use <br> in tables instead of spaces
151    pub br_in_tables: bool,
152
153    /// Enable spatial table reconstruction in hOCR documents
154    pub hocr_spatial_tables: bool,
155
156    /// Highlight style for <mark> elements
157    pub highlight_style: HighlightStyle,
158
159    /// Extract metadata from HTML
160    pub extract_metadata: bool,
161
162    /// Whitespace handling mode
163    pub whitespace_mode: WhitespaceMode,
164
165    /// Strip newlines from HTML before processing
166    pub strip_newlines: bool,
167
168    /// Enable text wrapping
169    pub wrap: bool,
170
171    /// Text wrap width
172    pub wrap_width: usize,
173
174    /// Treat block elements as inline
175    pub convert_as_inline: bool,
176
177    /// Subscript symbol
178    pub sub_symbol: String,
179
180    /// Superscript symbol
181    pub sup_symbol: String,
182
183    /// Newline style
184    pub newline_style: NewlineStyle,
185
186    /// Code block style
187    pub code_block_style: CodeBlockStyle,
188
189    /// Elements where images should remain as markdown (not converted to alt text)
190    pub keep_inline_images_in: Vec<String>,
191
192    /// Preprocessing options
193    pub preprocessing: PreprocessingOptions,
194
195    /// Source encoding (informational)
196    pub encoding: String,
197
198    /// Enable debug mode with diagnostic warnings
199    pub debug: bool,
200
201    /// List of HTML tags to strip (output only text content, no markdown conversion)
202    pub strip_tags: Vec<String>,
203
204    /// List of HTML tags to preserve as-is in the output (keep original HTML)
205    /// Useful for complex elements like tables that don't convert well to Markdown
206    pub preserve_tags: Vec<String>,
207}
208
209impl Default for ConversionOptions {
210    fn default() -> Self {
211        Self {
212            heading_style: HeadingStyle::default(),
213            list_indent_type: ListIndentType::default(),
214            list_indent_width: 2,
215            bullets: "-".to_string(),
216            strong_em_symbol: '*',
217            escape_asterisks: false,
218            escape_underscores: false,
219            escape_misc: false,
220            escape_ascii: false,
221            code_language: String::new(),
222            autolinks: true,
223            default_title: false,
224            br_in_tables: false,
225            hocr_spatial_tables: true,
226            highlight_style: HighlightStyle::default(),
227            extract_metadata: true,
228            whitespace_mode: WhitespaceMode::default(),
229            strip_newlines: false,
230            wrap: false,
231            wrap_width: 80,
232            convert_as_inline: false,
233            sub_symbol: String::new(),
234            sup_symbol: String::new(),
235            newline_style: NewlineStyle::Spaces,
236            code_block_style: CodeBlockStyle::default(),
237            keep_inline_images_in: Vec::new(),
238            preprocessing: PreprocessingOptions::default(),
239            encoding: "utf-8".to_string(),
240            debug: false,
241            strip_tags: Vec::new(),
242            preserve_tags: Vec::new(),
243        }
244    }
245}
246
247/// HTML preprocessing options.
248#[derive(Debug, Clone)]
249pub struct PreprocessingOptions {
250    /// Enable preprocessing
251    pub enabled: bool,
252
253    /// Preprocessing preset
254    pub preset: PreprocessingPreset,
255
256    /// Remove navigation elements
257    pub remove_navigation: bool,
258
259    /// Remove form elements
260    pub remove_forms: bool,
261}
262
263impl Default for PreprocessingOptions {
264    fn default() -> Self {
265        Self {
266            enabled: true,
267            preset: PreprocessingPreset::default(),
268            remove_navigation: true,
269            remove_forms: true,
270        }
271    }
272}