html_to_markdown_rs/
options.rs

1//! Configuration options for HTML to Markdown conversion.
2
/// Markdown syntax used when emitting headings.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum HeadingStyle {
    /// Underlined headings (`===` beneath an h1, `---` beneath an h2).
    Underlined,
    /// ATX headings (`#` for h1, `##` for h2, and so on). This is the default.
    #[default]
    Atx,
    /// Closed ATX headings, with a trailing marker (`# title #`).
    AtxClosed,
}
14
15impl HeadingStyle {
16    pub fn parse(value: &str) -> Self {
17        match normalize_token(value).as_str() {
18            "atx" => Self::Atx,
19            "atxclosed" => Self::AtxClosed,
20            _ => Self::Underlined,
21        }
22    }
23}
24
/// Kind of character used to indent lists.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ListIndentType {
    /// Indent with spaces. This is the default.
    #[default]
    Spaces,
    /// Indent with tabs.
    Tabs,
}
32
33impl ListIndentType {
34    pub fn parse(value: &str) -> Self {
35        match normalize_token(value).as_str() {
36            "tabs" => Self::Tabs,
37            _ => Self::Spaces,
38        }
39    }
40}
41
/// Mode selecting how whitespace is handled.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum WhitespaceMode {
    /// Normalized whitespace handling. This is the default.
    #[default]
    Normalized,
    /// Strict whitespace handling.
    Strict,
}
49
50impl WhitespaceMode {
51    pub fn parse(value: &str) -> Self {
52        match normalize_token(value).as_str() {
53            "strict" => Self::Strict,
54            _ => Self::Normalized,
55        }
56    }
57}
58
/// Marker used to express a line break.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum NewlineStyle {
    /// Two spaces at the end of the line. This is the default.
    #[default]
    Spaces,
    /// A backslash at the end of the line.
    Backslash,
}
68
69impl NewlineStyle {
70    pub fn parse(value: &str) -> Self {
71        match normalize_token(value).as_str() {
72            "backslash" => Self::Backslash,
73            _ => Self::Spaces,
74        }
75    }
76}
77
/// Syntax used when emitting code blocks.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum CodeBlockStyle {
    /// Indented code blocks (4 spaces), the CommonMark default. This is the default here too.
    #[default]
    Indented,
    /// Fenced code blocks delimited by backticks (```).
    Backticks,
    /// Fenced code blocks delimited by tildes (~~~).
    Tildes,
}
89
90impl CodeBlockStyle {
91    pub fn parse(value: &str) -> Self {
92        match normalize_token(value).as_str() {
93            "backticks" => Self::Backticks,
94            "tildes" => Self::Tildes,
95            _ => Self::Indented,
96        }
97    }
98}
99
/// Rendering used for `<mark>` elements.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum HighlightStyle {
    /// `==text==`. This is the default.
    #[default]
    DoubleEqual,
    /// `<mark>text</mark>`
    Html,
    /// `**text**`
    Bold,
    /// Plain text, with no formatting applied.
    None,
}
113
114impl HighlightStyle {
115    pub fn parse(value: &str) -> Self {
116        match normalize_token(value).as_str() {
117            "doubleequal" => Self::DoubleEqual,
118            "html" => Self::Html,
119            "bold" => Self::Bold,
120            "none" => Self::None,
121            _ => Self::None,
122        }
123    }
124}
125
/// Named preset levels for HTML preprocessing.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum PreprocessingPreset {
    /// The minimal preset.
    Minimal,
    /// The standard preset. This is the default.
    #[default]
    Standard,
    /// The aggressive preset.
    Aggressive,
}
134
135impl PreprocessingPreset {
136    pub fn parse(value: &str) -> Self {
137        match normalize_token(value).as_str() {
138            "minimal" => Self::Minimal,
139            "aggressive" => Self::Aggressive,
140            _ => Self::Standard,
141        }
142    }
143}
144
145/// Main conversion options.
146#[derive(Debug, Clone)]
147pub struct ConversionOptions {
148    /// Heading style
149    pub heading_style: HeadingStyle,
150
151    /// List indentation type
152    pub list_indent_type: ListIndentType,
153
154    /// List indentation width (spaces)
155    pub list_indent_width: usize,
156
157    /// Bullet characters for unordered lists
158    pub bullets: String,
159
160    /// Symbol for strong/emphasis (* or _)
161    pub strong_em_symbol: char,
162
163    /// Escape asterisks in text
164    pub escape_asterisks: bool,
165
166    /// Escape underscores in text
167    pub escape_underscores: bool,
168
169    /// Escape misc markdown characters
170    pub escape_misc: bool,
171
172    /// Escape all ASCII punctuation (for CommonMark spec compliance tests)
173    pub escape_ascii: bool,
174
175    /// Default code language
176    pub code_language: String,
177
178    /// Use autolinks for bare URLs
179    pub autolinks: bool,
180
181    /// Add default title if none exists
182    pub default_title: bool,
183
184    /// Use <br> in tables instead of spaces
185    pub br_in_tables: bool,
186
187    /// Enable spatial table reconstruction in hOCR documents
188    pub hocr_spatial_tables: bool,
189
190    /// Highlight style for <mark> elements
191    pub highlight_style: HighlightStyle,
192
193    /// Extract metadata from HTML
194    pub extract_metadata: bool,
195
196    /// Whitespace handling mode
197    pub whitespace_mode: WhitespaceMode,
198
199    /// Strip newlines from HTML before processing
200    pub strip_newlines: bool,
201
202    /// Enable text wrapping
203    pub wrap: bool,
204
205    /// Text wrap width
206    pub wrap_width: usize,
207
208    /// Treat block elements as inline
209    pub convert_as_inline: bool,
210
211    /// Subscript symbol
212    pub sub_symbol: String,
213
214    /// Superscript symbol
215    pub sup_symbol: String,
216
217    /// Newline style
218    pub newline_style: NewlineStyle,
219
220    /// Code block style
221    pub code_block_style: CodeBlockStyle,
222
223    /// Elements where images should remain as markdown (not converted to alt text)
224    pub keep_inline_images_in: Vec<String>,
225
226    /// Preprocessing options
227    pub preprocessing: PreprocessingOptions,
228
229    /// Source encoding (informational)
230    pub encoding: String,
231
232    /// Enable debug mode with diagnostic warnings
233    pub debug: bool,
234
235    /// List of HTML tags to strip (output only text content, no markdown conversion)
236    pub strip_tags: Vec<String>,
237
238    /// List of HTML tags to preserve as-is in the output (keep original HTML)
239    /// Useful for complex elements like tables that don't convert well to Markdown
240    pub preserve_tags: Vec<String>,
241}
242
243/// Partial update for ConversionOptions.
244#[derive(Debug, Clone, Default)]
245#[cfg_attr(any(feature = "serde", feature = "metadata"), derive(serde::Deserialize))]
246#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
247pub struct ConversionOptionsUpdate {
248    pub heading_style: Option<HeadingStyle>,
249    pub list_indent_type: Option<ListIndentType>,
250    pub list_indent_width: Option<usize>,
251    pub bullets: Option<String>,
252    pub strong_em_symbol: Option<char>,
253    pub escape_asterisks: Option<bool>,
254    pub escape_underscores: Option<bool>,
255    pub escape_misc: Option<bool>,
256    pub escape_ascii: Option<bool>,
257    pub code_language: Option<String>,
258    pub autolinks: Option<bool>,
259    pub default_title: Option<bool>,
260    pub br_in_tables: Option<bool>,
261    pub hocr_spatial_tables: Option<bool>,
262    pub highlight_style: Option<HighlightStyle>,
263    pub extract_metadata: Option<bool>,
264    pub whitespace_mode: Option<WhitespaceMode>,
265    pub strip_newlines: Option<bool>,
266    pub wrap: Option<bool>,
267    pub wrap_width: Option<usize>,
268    pub convert_as_inline: Option<bool>,
269    pub sub_symbol: Option<String>,
270    pub sup_symbol: Option<String>,
271    pub newline_style: Option<NewlineStyle>,
272    pub code_block_style: Option<CodeBlockStyle>,
273    pub keep_inline_images_in: Option<Vec<String>>,
274    pub preprocessing: Option<PreprocessingOptionsUpdate>,
275    pub encoding: Option<String>,
276    pub debug: Option<bool>,
277    pub strip_tags: Option<Vec<String>>,
278    pub preserve_tags: Option<Vec<String>>,
279}
280
281impl Default for ConversionOptions {
282    fn default() -> Self {
283        Self {
284            heading_style: HeadingStyle::default(),
285            list_indent_type: ListIndentType::default(),
286            list_indent_width: 2,
287            bullets: "-".to_string(),
288            strong_em_symbol: '*',
289            escape_asterisks: false,
290            escape_underscores: false,
291            escape_misc: false,
292            escape_ascii: false,
293            code_language: String::new(),
294            autolinks: true,
295            default_title: false,
296            br_in_tables: false,
297            hocr_spatial_tables: true,
298            highlight_style: HighlightStyle::default(),
299            extract_metadata: true,
300            whitespace_mode: WhitespaceMode::default(),
301            strip_newlines: false,
302            wrap: false,
303            wrap_width: 80,
304            convert_as_inline: false,
305            sub_symbol: String::new(),
306            sup_symbol: String::new(),
307            newline_style: NewlineStyle::Spaces,
308            code_block_style: CodeBlockStyle::default(),
309            keep_inline_images_in: Vec::new(),
310            preprocessing: PreprocessingOptions::default(),
311            encoding: "utf-8".to_string(),
312            debug: false,
313            strip_tags: Vec::new(),
314            preserve_tags: Vec::new(),
315        }
316    }
317}
318
319impl ConversionOptions {
320    pub fn apply_update(&mut self, update: ConversionOptionsUpdate) {
321        if let Some(heading_style) = update.heading_style {
322            self.heading_style = heading_style;
323        }
324        if let Some(list_indent_type) = update.list_indent_type {
325            self.list_indent_type = list_indent_type;
326        }
327        if let Some(list_indent_width) = update.list_indent_width {
328            self.list_indent_width = list_indent_width;
329        }
330        if let Some(bullets) = update.bullets {
331            self.bullets = bullets;
332        }
333        if let Some(strong_em_symbol) = update.strong_em_symbol {
334            self.strong_em_symbol = strong_em_symbol;
335        }
336        if let Some(escape_asterisks) = update.escape_asterisks {
337            self.escape_asterisks = escape_asterisks;
338        }
339        if let Some(escape_underscores) = update.escape_underscores {
340            self.escape_underscores = escape_underscores;
341        }
342        if let Some(escape_misc) = update.escape_misc {
343            self.escape_misc = escape_misc;
344        }
345        if let Some(escape_ascii) = update.escape_ascii {
346            self.escape_ascii = escape_ascii;
347        }
348        if let Some(code_language) = update.code_language {
349            self.code_language = code_language;
350        }
351        if let Some(autolinks) = update.autolinks {
352            self.autolinks = autolinks;
353        }
354        if let Some(default_title) = update.default_title {
355            self.default_title = default_title;
356        }
357        if let Some(br_in_tables) = update.br_in_tables {
358            self.br_in_tables = br_in_tables;
359        }
360        if let Some(hocr_spatial_tables) = update.hocr_spatial_tables {
361            self.hocr_spatial_tables = hocr_spatial_tables;
362        }
363        if let Some(highlight_style) = update.highlight_style {
364            self.highlight_style = highlight_style;
365        }
366        if let Some(extract_metadata) = update.extract_metadata {
367            self.extract_metadata = extract_metadata;
368        }
369        if let Some(whitespace_mode) = update.whitespace_mode {
370            self.whitespace_mode = whitespace_mode;
371        }
372        if let Some(strip_newlines) = update.strip_newlines {
373            self.strip_newlines = strip_newlines;
374        }
375        if let Some(wrap) = update.wrap {
376            self.wrap = wrap;
377        }
378        if let Some(wrap_width) = update.wrap_width {
379            self.wrap_width = wrap_width;
380        }
381        if let Some(convert_as_inline) = update.convert_as_inline {
382            self.convert_as_inline = convert_as_inline;
383        }
384        if let Some(sub_symbol) = update.sub_symbol {
385            self.sub_symbol = sub_symbol;
386        }
387        if let Some(sup_symbol) = update.sup_symbol {
388            self.sup_symbol = sup_symbol;
389        }
390        if let Some(newline_style) = update.newline_style {
391            self.newline_style = newline_style;
392        }
393        if let Some(code_block_style) = update.code_block_style {
394            self.code_block_style = code_block_style;
395        }
396        if let Some(keep_inline_images_in) = update.keep_inline_images_in {
397            self.keep_inline_images_in = keep_inline_images_in;
398        }
399        if let Some(preprocessing) = update.preprocessing {
400            self.preprocessing.apply_update(preprocessing);
401        }
402        if let Some(encoding) = update.encoding {
403            self.encoding = encoding;
404        }
405        if let Some(debug) = update.debug {
406            self.debug = debug;
407        }
408        if let Some(strip_tags) = update.strip_tags {
409            self.strip_tags = strip_tags;
410        }
411        if let Some(preserve_tags) = update.preserve_tags {
412            self.preserve_tags = preserve_tags;
413        }
414    }
415
416    pub fn from_update(update: ConversionOptionsUpdate) -> Self {
417        let mut options = Self::default();
418        options.apply_update(update);
419        options
420    }
421}
422
423impl From<ConversionOptionsUpdate> for ConversionOptions {
424    fn from(update: ConversionOptionsUpdate) -> Self {
425        Self::from_update(update)
426    }
427}
428
429/// HTML preprocessing options.
430#[derive(Debug, Clone)]
431pub struct PreprocessingOptions {
432    /// Enable preprocessing
433    pub enabled: bool,
434
435    /// Preprocessing preset
436    pub preset: PreprocessingPreset,
437
438    /// Remove navigation elements
439    pub remove_navigation: bool,
440
441    /// Remove form elements
442    pub remove_forms: bool,
443}
444
445/// Partial update for PreprocessingOptions.
446#[derive(Debug, Clone, Default)]
447#[cfg_attr(any(feature = "serde", feature = "metadata"), derive(serde::Deserialize))]
448#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
449pub struct PreprocessingOptionsUpdate {
450    pub enabled: Option<bool>,
451    pub preset: Option<PreprocessingPreset>,
452    pub remove_navigation: Option<bool>,
453    pub remove_forms: Option<bool>,
454}
455
/// Reduces an option name to a canonical token for matching.
///
/// Keeps only ASCII alphanumeric characters and lowercases them; every other
/// character (whitespace, `-`, `_`, non-ASCII letters, ...) is dropped. For
/// example, `"ATX-Closed"` becomes `"atxclosed"`.
fn normalize_token(value: &str) -> String {
    value
        .chars()
        .filter(char::is_ascii_alphanumeric)
        .map(|ch| ch.to_ascii_lowercase())
        .collect()
}
465
/// `serde::Deserialize` implementations for the option enums, compiled only
/// when the `serde` or `metadata` feature is enabled.
#[cfg(any(feature = "serde", feature = "metadata"))]
mod serde_impls {
    use super::*;
    use serde::Deserialize;

    /// Implements `Deserialize` for `$ty` by reading a string and passing it
    /// to `$parser`. The parser is infallible, so the only errors surfaced
    /// are those from deserializing the string itself.
    macro_rules! impl_deserialize_from_parse {
        ($ty:ty, $parser:expr) => {
            impl<'de> Deserialize<'de> for $ty {
                fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
                where
                    D: serde::Deserializer<'de>,
                {
                    String::deserialize(deserializer).map(|raw| $parser(&raw))
                }
            }
        };
    }

    impl_deserialize_from_parse!(HeadingStyle, HeadingStyle::parse);
    impl_deserialize_from_parse!(ListIndentType, ListIndentType::parse);
    impl_deserialize_from_parse!(WhitespaceMode, WhitespaceMode::parse);
    impl_deserialize_from_parse!(NewlineStyle, NewlineStyle::parse);
    impl_deserialize_from_parse!(CodeBlockStyle, CodeBlockStyle::parse);
    impl_deserialize_from_parse!(HighlightStyle, HighlightStyle::parse);
    impl_deserialize_from_parse!(PreprocessingPreset, PreprocessingPreset::parse);
}
493
494impl Default for PreprocessingOptions {
495    fn default() -> Self {
496        Self {
497            enabled: false,
498            preset: PreprocessingPreset::default(),
499            remove_navigation: true,
500            remove_forms: true,
501        }
502    }
503}
504
505impl PreprocessingOptions {
506    pub fn apply_update(&mut self, update: PreprocessingOptionsUpdate) {
507        if let Some(enabled) = update.enabled {
508            self.enabled = enabled;
509        }
510        if let Some(preset) = update.preset {
511            self.preset = preset;
512        }
513        if let Some(remove_navigation) = update.remove_navigation {
514            self.remove_navigation = remove_navigation;
515        }
516        if let Some(remove_forms) = update.remove_forms {
517            self.remove_forms = remove_forms;
518        }
519    }
520
521    pub fn from_update(update: PreprocessingOptionsUpdate) -> Self {
522        let mut options = Self::default();
523        options.apply_update(update);
524        options
525    }
526}
527
528impl From<PreprocessingOptionsUpdate> for PreprocessingOptions {
529    fn from(update: PreprocessingOptionsUpdate) -> Self {
530        Self::from_update(update)
531    }
532}