html_to_markdown_rs/
options.rs

1//! Configuration options for HTML to Markdown conversion.
2
/// Heading style options.
///
/// The default is [`HeadingStyle::Atx`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum HeadingStyle {
    /// Underlined style (`===` for h1, `---` for h2).
    Underlined,
    /// ATX style (`#` for h1, `##` for h2, etc.).
    #[default]
    Atx,
    /// ATX closed style (`# title #`).
    AtxClosed,
}
19
/// List indentation type.
///
/// The default is [`ListIndentType::Spaces`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ListIndentType {
    /// Indentation using spaces.
    #[default]
    Spaces,
    /// Indentation using tabs.
    Tabs,
}
32
/// Whitespace handling mode.
///
/// The default is [`WhitespaceMode::Normalized`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum WhitespaceMode {
    /// Normalized whitespace handling.
    #[default]
    Normalized,
    /// Strict whitespace handling.
    Strict,
}
45
/// Newline style.
///
/// The default is [`NewlineStyle::Spaces`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum NewlineStyle {
    /// Two spaces at end of line.
    #[default]
    Spaces,
    /// Backslash at end of line.
    Backslash,
}
60
/// Code block style.
///
/// The default is [`CodeBlockStyle::Indented`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum CodeBlockStyle {
    /// Indented code blocks (4 spaces) - CommonMark default.
    #[default]
    Indented,
    /// Fenced code blocks delimited by triple backticks.
    Backticks,
    /// Fenced code blocks delimited by triple tildes (`~~~`).
    Tildes,
}
77
/// Highlight style for `<mark>` elements.
///
/// The default is [`HighlightStyle::DoubleEqual`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum HighlightStyle {
    /// `==text==`
    #[default]
    DoubleEqual,
    /// `<mark>text</mark>`
    Html,
    /// `**text**`
    Bold,
    /// Plain text (no formatting).
    None,
}
96
/// Preprocessing preset levels.
///
/// The default is [`PreprocessingPreset::Standard`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum PreprocessingPreset {
    /// Minimal preset.
    Minimal,
    /// Standard preset.
    #[default]
    Standard,
    /// Aggressive preset.
    Aggressive,
}
110
111/// Main conversion options.
112#[derive(Debug, Clone)]
113pub struct ConversionOptions {
114    /// Heading style
115    pub heading_style: HeadingStyle,
116
117    /// List indentation type
118    pub list_indent_type: ListIndentType,
119
120    /// List indentation width (spaces)
121    pub list_indent_width: usize,
122
123    /// Bullet characters for unordered lists
124    pub bullets: String,
125
126    /// Symbol for strong/emphasis (* or _)
127    pub strong_em_symbol: char,
128
129    /// Escape asterisks in text
130    pub escape_asterisks: bool,
131
132    /// Escape underscores in text
133    pub escape_underscores: bool,
134
135    /// Escape misc markdown characters
136    pub escape_misc: bool,
137
138    /// Escape all ASCII punctuation (for CommonMark spec compliance tests)
139    pub escape_ascii: bool,
140
141    /// Default code language
142    pub code_language: String,
143
144    /// Use autolinks for bare URLs
145    pub autolinks: bool,
146
147    /// Add default title if none exists
148    pub default_title: bool,
149
150    /// Use <br> in tables instead of spaces
151    pub br_in_tables: bool,
152
153    /// Highlight style for <mark> elements
154    pub highlight_style: HighlightStyle,
155
156    /// Extract metadata from HTML
157    pub extract_metadata: bool,
158
159    /// Whitespace handling mode
160    pub whitespace_mode: WhitespaceMode,
161
162    /// Strip newlines from HTML before processing
163    pub strip_newlines: bool,
164
165    /// Enable text wrapping
166    pub wrap: bool,
167
168    /// Text wrap width
169    pub wrap_width: usize,
170
171    /// Treat block elements as inline
172    pub convert_as_inline: bool,
173
174    /// Subscript symbol
175    pub sub_symbol: String,
176
177    /// Superscript symbol
178    pub sup_symbol: String,
179
180    /// Newline style
181    pub newline_style: NewlineStyle,
182
183    /// Code block style
184    pub code_block_style: CodeBlockStyle,
185
186    /// Elements where images should remain as markdown (not converted to alt text)
187    pub keep_inline_images_in: Vec<String>,
188
189    /// Enable hOCR table extraction
190    pub hocr_extract_tables: bool,
191
192    /// Column threshold for hOCR table detection (pixels)
193    pub hocr_table_column_threshold: u32,
194
195    /// Row threshold ratio for hOCR table detection
196    pub hocr_table_row_threshold_ratio: f64,
197
198    /// Preprocessing options
199    pub preprocessing: PreprocessingOptions,
200
201    /// Parsing options
202    pub parsing: ParsingOptions,
203
204    /// Enable debug mode with diagnostic warnings
205    pub debug: bool,
206
207    /// List of HTML tags to strip (output only text content, no markdown conversion)
208    pub strip_tags: Vec<String>,
209}
210
211impl Default for ConversionOptions {
212    fn default() -> Self {
213        Self {
214            heading_style: HeadingStyle::default(),
215            list_indent_type: ListIndentType::default(),
216            list_indent_width: 2,
217            bullets: "-".to_string(),
218            strong_em_symbol: '*',
219            escape_asterisks: false,
220            escape_underscores: false,
221            escape_misc: false,
222            escape_ascii: false,
223            code_language: String::new(),
224            autolinks: true,
225            default_title: false,
226            br_in_tables: false,
227            highlight_style: HighlightStyle::default(),
228            extract_metadata: true,
229            whitespace_mode: WhitespaceMode::default(),
230            strip_newlines: false,
231            wrap: false,
232            wrap_width: 80,
233            convert_as_inline: false,
234            sub_symbol: String::new(),
235            sup_symbol: String::new(),
236            newline_style: NewlineStyle::Spaces,
237            code_block_style: CodeBlockStyle::default(),
238            keep_inline_images_in: Vec::new(),
239            hocr_extract_tables: true,
240            hocr_table_column_threshold: 50,
241            hocr_table_row_threshold_ratio: 0.5,
242            preprocessing: PreprocessingOptions::default(),
243            parsing: ParsingOptions::default(),
244            debug: false,
245            strip_tags: Vec::new(),
246        }
247    }
248}
249
250/// HTML preprocessing options.
251#[derive(Debug, Clone)]
252pub struct PreprocessingOptions {
253    /// Enable preprocessing
254    pub enabled: bool,
255
256    /// Preprocessing preset
257    pub preset: PreprocessingPreset,
258
259    /// Remove navigation elements
260    pub remove_navigation: bool,
261
262    /// Remove form elements
263    pub remove_forms: bool,
264}
265
266impl Default for PreprocessingOptions {
267    fn default() -> Self {
268        Self {
269            enabled: false,
270            preset: PreprocessingPreset::default(),
271            remove_navigation: true,
272            remove_forms: true,
273        }
274    }
275}
276
/// HTML parsing options.
///
/// Implements `PartialEq`/`Eq` so two configurations can be compared
/// directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsingOptions {
    /// Source encoding. Default: `"utf-8"`.
    pub encoding: String,

    /// HTML parser to use. Default: `None`.
    pub parser: Option<String>,
}

impl Default for ParsingOptions {
    /// Returns the default parsing configuration: `"utf-8"` encoding and no
    /// explicit parser.
    fn default() -> Self {
        Self {
            encoding: String::from("utf-8"),
            parser: None,
        }
    }
}