Skip to main content

html_to_markdown_rs/options/
conversion.rs

1#![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
2
3//! Main conversion options and their builder utilities.
4//!
5//! This module provides the primary `ConversionOptions` struct with all configuration
6//! settings for HTML to Markdown conversion, along with partial update support for
7//! selective option modifications.
8
9use crate::options::preprocessing::PreprocessingOptions;
10use crate::options::preprocessing::PreprocessingOptionsUpdate;
11use crate::options::validation::{
12    CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
13};
14
15/// Main conversion options for HTML to Markdown conversion.
16#[derive(Debug, Clone)]
17#[cfg_attr(
18    any(feature = "serde", feature = "metadata"),
19    derive(serde::Serialize, serde::Deserialize)
20)]
21#[cfg_attr(
22    any(feature = "serde", feature = "metadata"),
23    serde(rename_all = "camelCase", default)
24)]
25pub struct ConversionOptions {
26    /// Heading style (Underlined, Atx, `AtxClosed`)
27    pub heading_style: HeadingStyle,
28
29    /// List indentation type (Spaces or Tabs)
30    pub list_indent_type: ListIndentType,
31
32    /// List indentation width in spaces (applied if using spaces indentation)
33    pub list_indent_width: usize,
34
35    /// Bullet characters for unordered lists (e.g., "-", "*", "+")
36    pub bullets: String,
37
38    /// Symbol for strong/emphasis emphasis rendering (* or _)
39    pub strong_em_symbol: char,
40
41    /// Escape asterisks (*) in text to prevent accidental formatting
42    pub escape_asterisks: bool,
43
44    /// Escape underscores (_) in text to prevent accidental formatting
45    pub escape_underscores: bool,
46
47    /// Escape miscellaneous markdown characters (\ & < ` [ > ~ # = + | -)
48    pub escape_misc: bool,
49
50    /// Escape all ASCII punctuation characters (for `CommonMark` spec compliance tests)
51    pub escape_ascii: bool,
52
53    /// Default code language for fenced code blocks when not specified
54    pub code_language: String,
55
56    /// Use autolinks syntax for bare URLs (<http://example.com>)
57    pub autolinks: bool,
58
59    /// Add default title element to HTML if none exists before conversion
60    pub default_title: bool,
61
62    /// Use HTML <br> elements in tables instead of spaces for line breaks
63    pub br_in_tables: bool,
64
65    /// Enable spatial table reconstruction in hOCR documents (via spatial positioning analysis)
66    pub hocr_spatial_tables: bool,
67
68    /// Highlight style for <mark> elements (`DoubleEqual`, Html, Bold, None)
69    pub highlight_style: HighlightStyle,
70
71    /// Extract metadata from HTML (title, description, images, links, etc.)
72    pub extract_metadata: bool,
73
74    /// Whitespace handling mode (Normalized collapses multiple spaces, Strict preserves)
75    pub whitespace_mode: WhitespaceMode,
76
77    /// Strip newline characters from HTML before processing
78    pub strip_newlines: bool,
79
80    /// Enable automatic text wrapping at `wrap_width`
81    pub wrap: bool,
82
83    /// Text wrapping width in characters (default 80)
84    pub wrap_width: usize,
85
86    /// Treat block-level elements as inline during conversion
87    pub convert_as_inline: bool,
88
89    /// Custom symbol for subscript content (e.g., "~")
90    pub sub_symbol: String,
91
92    /// Custom symbol for superscript content (e.g., "^")
93    pub sup_symbol: String,
94
95    /// Newline style in markdown output (Spaces adds two spaces, Backslash adds \)
96    pub newline_style: NewlineStyle,
97
98    /// Code block fence style (Indented, Backticks, Tildes)
99    pub code_block_style: CodeBlockStyle,
100
101    /// HTML elements where images should remain as markdown links (not converted to alt text)
102    pub keep_inline_images_in: Vec<String>,
103
104    /// HTML preprocessing options (remove nav, forms, etc.)
105    pub preprocessing: PreprocessingOptions,
106
107    /// Source document encoding (informational, typically "utf-8")
108    pub encoding: String,
109
110    /// Enable debug mode with diagnostic warnings on conversion issues
111    pub debug: bool,
112
113    /// HTML tags to strip (extract text content, no markdown conversion)
114    pub strip_tags: Vec<String>,
115
116    /// HTML tags to preserve as-is in output (keep original HTML, useful for complex tables)
117    pub preserve_tags: Vec<String>,
118
119    /// Skip all images during conversion.
120    /// When enabled, all `<img>` elements are completely omitted from output.
121    /// Useful for text-only extraction or filtering out visual content.
122    pub skip_images: bool,
123
124    /// Output format for conversion (Markdown or Djot)
125    pub output_format: OutputFormat,
126}
127
128/// Partial update for `ConversionOptions`.
129///
130/// This struct uses `Option<T>` to represent optional fields that can be selectively updated.
131/// Only specified fields (Some values) will override existing options; None values leave the
132/// corresponding fields unchanged when applied via [`ConversionOptions::apply_update`].
133#[derive(Debug, Clone, Default)]
134#[cfg_attr(
135    any(feature = "serde", feature = "metadata"),
136    derive(serde::Serialize, serde::Deserialize)
137)]
138#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
139pub struct ConversionOptionsUpdate {
140    /// Optional heading style override (Underlined, Atx, `AtxClosed`)
141    pub heading_style: Option<HeadingStyle>,
142
143    /// Optional list indentation type override (Spaces or Tabs)
144    pub list_indent_type: Option<ListIndentType>,
145
146    /// Optional list indentation width override in spaces
147    pub list_indent_width: Option<usize>,
148
149    /// Optional bullet characters override for unordered lists
150    pub bullets: Option<String>,
151
152    /// Optional strong/emphasis symbol override (* or _)
153    pub strong_em_symbol: Option<char>,
154
155    /// Optional asterisk escaping override in text content
156    pub escape_asterisks: Option<bool>,
157
158    /// Optional underscore escaping override in text content
159    pub escape_underscores: Option<bool>,
160
161    /// Optional miscellaneous character escaping override (\ & < ` [ > ~ # = + | -)
162    pub escape_misc: Option<bool>,
163
164    /// Optional ASCII punctuation escaping override (for spec compliance testing)
165    pub escape_ascii: Option<bool>,
166
167    /// Optional default code language override for fenced code blocks
168    pub code_language: Option<String>,
169
170    /// Optional autolinks syntax override for bare URLs
171    pub autolinks: Option<bool>,
172
173    /// Optional default title element injection override
174    pub default_title: Option<bool>,
175
176    /// Optional HTML <br> usage in tables override
177    pub br_in_tables: Option<bool>,
178
179    /// Optional spatial table reconstruction for hOCR documents override
180    pub hocr_spatial_tables: Option<bool>,
181
182    /// Optional highlight style override for <mark> elements
183    pub highlight_style: Option<HighlightStyle>,
184
185    /// Optional metadata extraction override (title, description, images, links)
186    pub extract_metadata: Option<bool>,
187
188    /// Optional whitespace handling mode override (Normalized or Strict)
189    pub whitespace_mode: Option<WhitespaceMode>,
190
191    /// Optional newline stripping override before processing
192    pub strip_newlines: Option<bool>,
193
194    /// Optional automatic text wrapping override
195    pub wrap: Option<bool>,
196
197    /// Optional text wrapping width override in characters
198    pub wrap_width: Option<usize>,
199
200    /// Optional block-level to inline conversion override
201    pub convert_as_inline: Option<bool>,
202
203    /// Optional subscript symbol override
204    pub sub_symbol: Option<String>,
205
206    /// Optional superscript symbol override
207    pub sup_symbol: Option<String>,
208
209    /// Optional newline style override for markdown output
210    pub newline_style: Option<NewlineStyle>,
211
212    /// Optional code block fence style override (Indented, Backticks, Tildes)
213    pub code_block_style: Option<CodeBlockStyle>,
214
215    /// Optional context elements where images remain as markdown links override
216    pub keep_inline_images_in: Option<Vec<String>>,
217
218    /// Optional preprocessing options partial update
219    pub preprocessing: Option<PreprocessingOptionsUpdate>,
220
221    /// Optional source document encoding override
222    pub encoding: Option<String>,
223
224    /// Optional debug mode override for diagnostic warnings
225    pub debug: Option<bool>,
226
227    /// Optional HTML tags to strip override (extract text, no conversion)
228    pub strip_tags: Option<Vec<String>>,
229
230    /// Optional HTML tags to preserve as-is override in output
231    pub preserve_tags: Option<Vec<String>>,
232
233    /// Optional skip images override
234    pub skip_images: Option<bool>,
235
236    /// Optional output format override (Markdown or Djot)
237    pub output_format: Option<OutputFormat>,
238}
239
240impl Default for ConversionOptions {
241    fn default() -> Self {
242        Self {
243            heading_style: HeadingStyle::default(),
244            list_indent_type: ListIndentType::default(),
245            list_indent_width: 2,
246            bullets: "-".to_string(),
247            strong_em_symbol: '*',
248            escape_asterisks: false,
249            escape_underscores: false,
250            escape_misc: false,
251            escape_ascii: false,
252            code_language: String::new(),
253            autolinks: true,
254            default_title: false,
255            br_in_tables: false,
256            hocr_spatial_tables: true,
257            highlight_style: HighlightStyle::default(),
258            extract_metadata: true,
259            whitespace_mode: WhitespaceMode::default(),
260            strip_newlines: false,
261            wrap: false,
262            wrap_width: 80,
263            convert_as_inline: false,
264            sub_symbol: String::new(),
265            sup_symbol: String::new(),
266            newline_style: NewlineStyle::Spaces,
267            code_block_style: CodeBlockStyle::default(),
268            keep_inline_images_in: Vec::new(),
269            preprocessing: PreprocessingOptions::default(),
270            encoding: "utf-8".to_string(),
271            debug: false,
272            strip_tags: Vec::new(),
273            preserve_tags: Vec::new(),
274            skip_images: false,
275            output_format: OutputFormat::default(),
276        }
277    }
278}
279
280impl ConversionOptions {
281    /// Apply a partial update to these conversion options.
282    ///
283    /// Any specified fields in the update will override the current values.
284    /// Unspecified fields (None) are left unchanged.
285    ///
286    /// # Arguments
287    ///
288    /// * `update` - Partial options update with fields to override
289    pub fn apply_update(&mut self, update: ConversionOptionsUpdate) {
290        if let Some(heading_style) = update.heading_style {
291            self.heading_style = heading_style;
292        }
293        if let Some(list_indent_type) = update.list_indent_type {
294            self.list_indent_type = list_indent_type;
295        }
296        if let Some(list_indent_width) = update.list_indent_width {
297            self.list_indent_width = list_indent_width;
298        }
299        if let Some(bullets) = update.bullets {
300            self.bullets = bullets;
301        }
302        if let Some(strong_em_symbol) = update.strong_em_symbol {
303            self.strong_em_symbol = strong_em_symbol;
304        }
305        if let Some(escape_asterisks) = update.escape_asterisks {
306            self.escape_asterisks = escape_asterisks;
307        }
308        if let Some(escape_underscores) = update.escape_underscores {
309            self.escape_underscores = escape_underscores;
310        }
311        if let Some(escape_misc) = update.escape_misc {
312            self.escape_misc = escape_misc;
313        }
314        if let Some(escape_ascii) = update.escape_ascii {
315            self.escape_ascii = escape_ascii;
316        }
317        if let Some(code_language) = update.code_language {
318            self.code_language = code_language;
319        }
320        if let Some(autolinks) = update.autolinks {
321            self.autolinks = autolinks;
322        }
323        if let Some(default_title) = update.default_title {
324            self.default_title = default_title;
325        }
326        if let Some(br_in_tables) = update.br_in_tables {
327            self.br_in_tables = br_in_tables;
328        }
329        if let Some(hocr_spatial_tables) = update.hocr_spatial_tables {
330            self.hocr_spatial_tables = hocr_spatial_tables;
331        }
332        if let Some(highlight_style) = update.highlight_style {
333            self.highlight_style = highlight_style;
334        }
335        if let Some(extract_metadata) = update.extract_metadata {
336            self.extract_metadata = extract_metadata;
337        }
338        if let Some(whitespace_mode) = update.whitespace_mode {
339            self.whitespace_mode = whitespace_mode;
340        }
341        if let Some(strip_newlines) = update.strip_newlines {
342            self.strip_newlines = strip_newlines;
343        }
344        if let Some(wrap) = update.wrap {
345            self.wrap = wrap;
346        }
347        if let Some(wrap_width) = update.wrap_width {
348            self.wrap_width = wrap_width;
349        }
350        if let Some(convert_as_inline) = update.convert_as_inline {
351            self.convert_as_inline = convert_as_inline;
352        }
353        if let Some(sub_symbol) = update.sub_symbol {
354            self.sub_symbol = sub_symbol;
355        }
356        if let Some(sup_symbol) = update.sup_symbol {
357            self.sup_symbol = sup_symbol;
358        }
359        if let Some(newline_style) = update.newline_style {
360            self.newline_style = newline_style;
361        }
362        if let Some(code_block_style) = update.code_block_style {
363            self.code_block_style = code_block_style;
364        }
365        if let Some(keep_inline_images_in) = update.keep_inline_images_in {
366            self.keep_inline_images_in = keep_inline_images_in;
367        }
368        if let Some(preprocessing) = update.preprocessing {
369            self.preprocessing.apply_update(preprocessing);
370        }
371        if let Some(encoding) = update.encoding {
372            self.encoding = encoding;
373        }
374        if let Some(debug) = update.debug {
375            self.debug = debug;
376        }
377        if let Some(strip_tags) = update.strip_tags {
378            self.strip_tags = strip_tags;
379        }
380        if let Some(preserve_tags) = update.preserve_tags {
381            self.preserve_tags = preserve_tags;
382        }
383        if let Some(skip_images) = update.skip_images {
384            self.skip_images = skip_images;
385        }
386        if let Some(output_format) = update.output_format {
387            self.output_format = output_format;
388        }
389    }
390
391    /// Create new conversion options from a partial update.
392    ///
393    /// Creates a new `ConversionOptions` struct with defaults, then applies the update.
394    /// Fields not specified in the update keep their default values.
395    ///
396    /// # Arguments
397    ///
398    /// * `update` - Partial options update with fields to set
399    ///
400    /// # Returns
401    ///
402    /// New `ConversionOptions` with specified updates applied to defaults
403    #[must_use]
404    pub fn from_update(update: ConversionOptionsUpdate) -> Self {
405        let mut options = Self::default();
406        options.apply_update(update);
407        options
408    }
409}
410
411impl From<ConversionOptionsUpdate> for ConversionOptions {
412    fn from(update: ConversionOptionsUpdate) -> Self {
413        Self::from_update(update)
414    }
415}
416
// Serialization tests; compiled only when a feature that enables the serde
// derives on `ConversionOptions` is active.
#[cfg(all(test, any(feature = "serde", feature = "metadata")))]
mod tests {
    use super::*;

    /// Customised options must survive a JSON serialize/deserialize round trip.
    #[test]
    fn test_conversion_options_serde() {
        // Struct update syntax instead of mutating a default instance
        // (avoids clippy::field_reassign_with_default).
        let options = ConversionOptions {
            heading_style: HeadingStyle::AtxClosed,
            list_indent_width: 4,
            bullets: "*".to_string(),
            escape_asterisks: true,
            whitespace_mode: WhitespaceMode::Strict,
            ..ConversionOptions::default()
        };

        // Serialize to JSON
        let json = serde_json::to_string(&options).expect("Failed to serialize");

        // Deserialize back
        let deserialized: ConversionOptions = serde_json::from_str(&json).expect("Failed to deserialize");

        // Verify values
        assert_eq!(deserialized.list_indent_width, 4);
        assert_eq!(deserialized.bullets, "*");
        assert!(deserialized.escape_asterisks);
        assert_eq!(deserialized.heading_style, HeadingStyle::AtxClosed);
        assert_eq!(deserialized.whitespace_mode, WhitespaceMode::Strict);
    }

    /// JSON that names only some fields must deserialize, with every missing
    /// field taking its default value.
    #[test]
    fn test_conversion_options_partial_deserialization() {
        let partial_json = r#"{
            "headingStyle": "atxClosed",
            "listIndentWidth": 4,
            "bullets": "*"
        }"#;

        let deserialized: ConversionOptions =
            serde_json::from_str(partial_json).expect("Failed to deserialize partial JSON");

        // Verify specified values
        assert_eq!(deserialized.heading_style, HeadingStyle::AtxClosed);
        assert_eq!(deserialized.list_indent_width, 4);
        assert_eq!(deserialized.bullets, "*");

        // Verify missing fields use defaults
        assert!(!deserialized.escape_asterisks); // default
        assert!(!deserialized.escape_underscores); // default
        assert_eq!(deserialized.list_indent_type, ListIndentType::Spaces); // default
    }
}