Skip to main content

html_to_markdown_rs/options/
conversion.rs

1#![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
2
3//! Main conversion options with builder pattern.
4
5use crate::options::preprocessing::PreprocessingOptions;
6use crate::options::validation::{
7    CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
8};
9
/// Main conversion options for HTML to Markdown conversion.
///
/// Use [`ConversionOptions::builder()`] to construct, or [`Default::default()`] for defaults.
///
/// When the `serde` or `metadata` feature is enabled the struct is (de)serializable.
/// Fields missing from the input fall back to their [`Default`] values, and unknown
/// fields are rejected (`serde(default, deny_unknown_fields)`).
///
/// # Example
///
/// ```text
/// use html_to_markdown_rs::ConversionOptions;
///
/// let options = ConversionOptions::builder()
///     .heading_style(HeadingStyle::Atx)
///     .wrap(true)
///     .wrap_width(100)
///     .build();
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(
    any(feature = "serde", feature = "metadata"),
    derive(serde::Serialize, serde::Deserialize)
)]
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(default, deny_unknown_fields))]
pub struct ConversionOptions {
    /// Heading style to use in Markdown output (ATX `#` or Setext underline).
    pub heading_style: HeadingStyle,
    /// How to indent nested list items (spaces or tab).
    pub list_indent_type: ListIndentType,
    /// Number of spaces (or tabs) to use for each level of list indentation (default `2`).
    pub list_indent_width: usize,
    /// Bullet character(s) to use for unordered list items (e.g. `"-"`, `"*"`).
    /// Defaults to `"-*+"`.
    pub bullets: String,
    /// Character used for bold/italic emphasis markers (`*` or `_`; default `*`).
    pub strong_em_symbol: char,
    /// Escape `*` characters in plain text to avoid unintended bold/italic (default `false`).
    pub escape_asterisks: bool,
    /// Escape `_` characters in plain text to avoid unintended bold/italic (default `false`).
    pub escape_underscores: bool,
    /// Escape miscellaneous Markdown metacharacters (`[]()#` etc.) in plain text (default `false`).
    pub escape_misc: bool,
    /// Escape ASCII characters that have special meaning in certain Markdown dialects
    /// (default `false`).
    pub escape_ascii: bool,
    /// Default language annotation for fenced code blocks that have no language hint.
    /// Defaults to the empty string.
    pub code_language: String,
    /// Automatically convert bare URLs into Markdown autolinks (default `true`).
    pub autolinks: bool,
    /// Emit a default title when no `<title>` tag is present (default `false`).
    pub default_title: bool,
    /// Render `<br>` elements inside table cells as literal line breaks (default `false`).
    pub br_in_tables: bool,
    /// Style used for `<mark>` / highlighted text (e.g. `==text==`).
    pub highlight_style: HighlightStyle,
    /// Extract `<meta>` and `<head>` information into the result metadata (default `true`).
    pub extract_metadata: bool,
    /// Controls how whitespace is normalised during conversion.
    pub whitespace_mode: WhitespaceMode,
    /// Strip all newlines from the output, producing a single-line result (default `false`).
    pub strip_newlines: bool,
    /// Wrap long lines at [`wrap_width`](Self::wrap_width) characters (default `false`).
    pub wrap: bool,
    /// Maximum line width when [`wrap`](Self::wrap) is enabled (default `80`).
    pub wrap_width: usize,
    /// Treat the entire document as inline content (no block-level wrappers; default `false`).
    pub convert_as_inline: bool,
    /// Markdown notation for subscript text (e.g. `"~"`). Defaults to the empty string.
    pub sub_symbol: String,
    /// Markdown notation for superscript text (e.g. `"^"`). Defaults to the empty string.
    pub sup_symbol: String,
    /// How to encode hard line breaks (`<br>`) in Markdown (default [`NewlineStyle::Spaces`]).
    pub newline_style: NewlineStyle,
    /// Style used for fenced code blocks (backticks or tilde).
    pub code_block_style: CodeBlockStyle,
    /// HTML tag names whose `<img>` children are kept inline instead of block.
    /// Defaults to an empty list.
    pub keep_inline_images_in: Vec<String>,
    /// Pre-processing options applied to the HTML before conversion.
    pub preprocessing: PreprocessingOptions,
    /// Expected character encoding of the input HTML (default `"utf-8"`).
    pub encoding: String,
    /// Emit debug information during conversion (default `false`).
    pub debug: bool,
    /// HTML tag names whose content is stripped from the output entirely.
    /// Defaults to an empty list.
    pub strip_tags: Vec<String>,
    /// HTML tag names that are preserved verbatim in the output.
    /// Defaults to an empty list.
    pub preserve_tags: Vec<String>,
    /// Skip conversion of `<img>` elements (omit images from output; default `false`).
    pub skip_images: bool,
    /// Link rendering style (inline or reference).
    pub link_style: LinkStyle,
    /// Target output format (Markdown, plain text, etc.).
    pub output_format: OutputFormat,
    /// Include structured document tree in result (default `false`).
    pub include_document_structure: bool,
    /// Extract inline images from data URIs and SVGs (default `false`).
    pub extract_images: bool,
    /// Maximum decoded image size in bytes (default 5MB, i.e. `5_242_880`).
    pub max_image_size: u64,
    /// Capture SVG elements as images (default `false`).
    pub capture_svg: bool,
    /// Infer image dimensions from data (default `true`).
    pub infer_dimensions: bool,
}
109
110impl Default for ConversionOptions {
111    fn default() -> Self {
112        Self {
113            heading_style: HeadingStyle::default(),
114            list_indent_type: ListIndentType::default(),
115            list_indent_width: 2,
116            bullets: "-*+".to_string(),
117            strong_em_symbol: '*',
118            escape_asterisks: false,
119            escape_underscores: false,
120            escape_misc: false,
121            escape_ascii: false,
122            code_language: String::new(),
123            autolinks: true,
124            default_title: false,
125            br_in_tables: false,
126            highlight_style: HighlightStyle::default(),
127            extract_metadata: true,
128            whitespace_mode: WhitespaceMode::default(),
129            strip_newlines: false,
130            wrap: false,
131            wrap_width: 80,
132            convert_as_inline: false,
133            sub_symbol: String::new(),
134            sup_symbol: String::new(),
135            newline_style: NewlineStyle::Spaces,
136            code_block_style: CodeBlockStyle::default(),
137            keep_inline_images_in: Vec::new(),
138            preprocessing: PreprocessingOptions::default(),
139            encoding: "utf-8".to_string(),
140            debug: false,
141            strip_tags: Vec::new(),
142            preserve_tags: Vec::new(),
143            skip_images: false,
144            link_style: LinkStyle::default(),
145            output_format: OutputFormat::default(),
146            include_document_structure: false,
147            extract_images: false,
148            max_image_size: 5_242_880,
149            capture_svg: false,
150            infer_dimensions: true,
151        }
152    }
153}
154
155impl ConversionOptions {
156    /// Create a new builder with default values.
157    #[must_use]
158    pub fn builder() -> ConversionOptionsBuilder {
159        ConversionOptionsBuilder(Self::default())
160    }
161}
162
163// ── Builder ─────────────────────────────────────────────────────────────────
164
/// Builder for [`ConversionOptions`].
///
/// All fields start with default values. Call `.build()` to produce the final options.
///
/// The builder is a thin wrapper around a [`ConversionOptions`] value: every setter
/// takes the builder by value, overwrites one field, and returns the builder so calls
/// can be chained.
#[derive(Debug, Clone)]
pub struct ConversionOptionsBuilder(ConversionOptions);
170
// Generates a by-value builder setter named `$name` that stores `value: $ty` into the
// wrapped options (`self.0.$name`) and returns the builder for chaining.
//
// The rustdoc line is assembled per field so each generated setter links to the option
// it controls instead of sharing one generic sentence.
macro_rules! builder_setter {
    ($name:ident, $ty:ty) => {
        #[doc = concat!("Set [`ConversionOptions::", stringify!($name), "`] to `value`.")]
        #[must_use]
        pub fn $name(mut self, value: $ty) -> Self {
            self.0.$name = value;
            self
        }
    };
}
181
// Generates a by-value builder setter named `$name` that accepts anything convertible
// into `$ty` (for example `&str` for a `String` field), converts it, stores it into the
// wrapped options (`self.0.$name`) and returns the builder for chaining.
//
// The rustdoc line is assembled per field so each generated setter links to the option
// it controls instead of sharing one generic sentence.
macro_rules! builder_setter_into {
    ($name:ident, $ty:ty) => {
        #[doc = concat!(
            "Set [`ConversionOptions::",
            stringify!($name),
            "`] from any value convertible into `",
            stringify!($ty),
            "`."
        )]
        #[must_use]
        pub fn $name(mut self, value: impl Into<$ty>) -> Self {
            self.0.$name = value.into();
            self
        }
    };
}
192
193impl ConversionOptionsBuilder {
194    // Output control
195    builder_setter!(output_format, OutputFormat);
196    builder_setter!(include_document_structure, bool);
197    builder_setter!(extract_metadata, bool);
198    builder_setter!(extract_images, bool);
199
200    // Markdown formatting
201    builder_setter!(heading_style, HeadingStyle);
202    builder_setter!(list_indent_type, ListIndentType);
203    builder_setter!(list_indent_width, usize);
204    builder_setter_into!(bullets, String);
205    builder_setter!(strong_em_symbol, char);
206    builder_setter!(code_block_style, CodeBlockStyle);
207    builder_setter!(newline_style, NewlineStyle);
208    builder_setter!(highlight_style, HighlightStyle);
209    builder_setter_into!(code_language, String);
210    builder_setter!(link_style, LinkStyle);
211    builder_setter!(autolinks, bool);
212    builder_setter!(default_title, bool);
213    builder_setter!(br_in_tables, bool);
214    builder_setter_into!(sub_symbol, String);
215    builder_setter_into!(sup_symbol, String);
216
217    // Escaping
218    builder_setter!(escape_asterisks, bool);
219    builder_setter!(escape_underscores, bool);
220    builder_setter!(escape_misc, bool);
221    builder_setter!(escape_ascii, bool);
222
223    // Whitespace / wrapping
224    builder_setter!(whitespace_mode, WhitespaceMode);
225    builder_setter!(strip_newlines, bool);
226    builder_setter!(wrap, bool);
227    builder_setter!(wrap_width, usize);
228
229    // Element handling
230    builder_setter!(convert_as_inline, bool);
231    builder_setter!(skip_images, bool);
232
233    /// Set the list of HTML tag names whose content is stripped from output.
234    #[must_use]
235    pub fn strip_tags(mut self, tags: Vec<String>) -> Self {
236        self.0.strip_tags = tags;
237        self
238    }
239
240    /// Set the list of HTML tag names that are preserved verbatim in output.
241    #[must_use]
242    pub fn preserve_tags(mut self, tags: Vec<String>) -> Self {
243        self.0.preserve_tags = tags;
244        self
245    }
246
247    /// Set the list of HTML tag names whose `<img>` children are kept inline.
248    #[must_use]
249    pub fn keep_inline_images_in(mut self, tags: Vec<String>) -> Self {
250        self.0.keep_inline_images_in = tags;
251        self
252    }
253
254    // Image extraction config
255    builder_setter!(max_image_size, u64);
256    builder_setter!(capture_svg, bool);
257    builder_setter!(infer_dimensions, bool);
258
259    // Preprocessing
260    /// Set the pre-processing options applied to the HTML before conversion.
261    #[must_use]
262    pub fn preprocessing(mut self, preprocessing: PreprocessingOptions) -> Self {
263        self.0.preprocessing = preprocessing;
264        self
265    }
266
267    // Encoding
268    builder_setter_into!(encoding, String);
269
270    // Debug
271    builder_setter!(debug, bool);
272
273    /// Build the final [`ConversionOptions`].
274    #[must_use]
275    pub fn build(self) -> ConversionOptions {
276        self.0
277    }
278}
279
280// ── ConversionOptionsUpdate (for binding crate compatibility) ────────────
281
282use crate::options::preprocessing::PreprocessingOptionsUpdate;
283
/// Partial update for `ConversionOptions`.
///
/// Uses `Option<T>` fields for selective updates. Bindings use this to construct
/// options from language-native types. Prefer [`ConversionOptionsBuilder`] for Rust code.
///
/// A field set to `Some(value)` overwrites the matching [`ConversionOptions`] field when
/// passed to [`ConversionOptions::apply_update`]; a field left as `None` keeps the
/// current value. The derived [`Default`] is the all-`None` update, which changes nothing.
///
/// When the `serde` or `metadata` feature is enabled, deserialization rejects unknown
/// fields (`serde(deny_unknown_fields)`).
#[derive(Debug, Clone, Default)]
#[cfg_attr(
    any(feature = "serde", feature = "metadata"),
    derive(serde::Serialize, serde::Deserialize)
)]
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
pub struct ConversionOptionsUpdate {
    /// Optional override for [`ConversionOptions::heading_style`].
    pub heading_style: Option<HeadingStyle>,
    /// Optional override for [`ConversionOptions::list_indent_type`].
    pub list_indent_type: Option<ListIndentType>,
    /// Optional override for [`ConversionOptions::list_indent_width`].
    pub list_indent_width: Option<usize>,
    /// Optional override for [`ConversionOptions::bullets`].
    pub bullets: Option<String>,
    /// Optional override for [`ConversionOptions::strong_em_symbol`].
    pub strong_em_symbol: Option<char>,
    /// Optional override for [`ConversionOptions::escape_asterisks`].
    pub escape_asterisks: Option<bool>,
    /// Optional override for [`ConversionOptions::escape_underscores`].
    pub escape_underscores: Option<bool>,
    /// Optional override for [`ConversionOptions::escape_misc`].
    pub escape_misc: Option<bool>,
    /// Optional override for [`ConversionOptions::escape_ascii`].
    pub escape_ascii: Option<bool>,
    /// Optional override for [`ConversionOptions::code_language`].
    pub code_language: Option<String>,
    /// Optional override for [`ConversionOptions::autolinks`].
    pub autolinks: Option<bool>,
    /// Optional override for [`ConversionOptions::default_title`].
    pub default_title: Option<bool>,
    /// Optional override for [`ConversionOptions::br_in_tables`].
    pub br_in_tables: Option<bool>,
    /// Optional override for [`ConversionOptions::highlight_style`].
    pub highlight_style: Option<HighlightStyle>,
    /// Optional override for [`ConversionOptions::extract_metadata`].
    pub extract_metadata: Option<bool>,
    /// Optional override for [`ConversionOptions::whitespace_mode`].
    pub whitespace_mode: Option<WhitespaceMode>,
    /// Optional override for [`ConversionOptions::strip_newlines`].
    pub strip_newlines: Option<bool>,
    /// Optional override for [`ConversionOptions::wrap`].
    pub wrap: Option<bool>,
    /// Optional override for [`ConversionOptions::wrap_width`].
    pub wrap_width: Option<usize>,
    /// Optional override for [`ConversionOptions::convert_as_inline`].
    pub convert_as_inline: Option<bool>,
    /// Optional override for [`ConversionOptions::sub_symbol`].
    pub sub_symbol: Option<String>,
    /// Optional override for [`ConversionOptions::sup_symbol`].
    pub sup_symbol: Option<String>,
    /// Optional override for [`ConversionOptions::newline_style`].
    pub newline_style: Option<NewlineStyle>,
    /// Optional override for [`ConversionOptions::code_block_style`].
    pub code_block_style: Option<CodeBlockStyle>,
    /// Optional override for [`ConversionOptions::keep_inline_images_in`].
    /// `Some(list)` replaces the whole list.
    pub keep_inline_images_in: Option<Vec<String>>,
    /// Optional partial update for [`ConversionOptions::preprocessing`].
    ///
    /// Unlike the other fields this is itself a partial update: it is handed to the
    /// `apply_update` method of the existing pre-processing options rather than
    /// replacing them wholesale.
    pub preprocessing: Option<PreprocessingOptionsUpdate>,
    /// Optional override for [`ConversionOptions::encoding`].
    pub encoding: Option<String>,
    /// Optional override for [`ConversionOptions::debug`].
    pub debug: Option<bool>,
    /// Optional override for [`ConversionOptions::strip_tags`].
    /// `Some(list)` replaces the whole list.
    pub strip_tags: Option<Vec<String>>,
    /// Optional override for [`ConversionOptions::preserve_tags`].
    /// `Some(list)` replaces the whole list.
    pub preserve_tags: Option<Vec<String>>,
    /// Optional override for [`ConversionOptions::skip_images`].
    pub skip_images: Option<bool>,
    /// Optional override for [`ConversionOptions::link_style`].
    pub link_style: Option<LinkStyle>,
    /// Optional override for [`ConversionOptions::output_format`].
    pub output_format: Option<OutputFormat>,
    /// Optional override for [`ConversionOptions::include_document_structure`].
    pub include_document_structure: Option<bool>,
    /// Optional override for [`ConversionOptions::extract_images`].
    pub extract_images: Option<bool>,
    /// Optional override for [`ConversionOptions::max_image_size`].
    pub max_image_size: Option<u64>,
    /// Optional override for [`ConversionOptions::capture_svg`].
    pub capture_svg: Option<bool>,
    /// Optional override for [`ConversionOptions::infer_dimensions`].
    pub infer_dimensions: Option<bool>,
}
372
373impl ConversionOptions {
374    /// Apply a partial update to these conversion options.
375    pub fn apply_update(&mut self, update: ConversionOptionsUpdate) {
376        macro_rules! apply {
377            ($field:ident) => {
378                if let Some(v) = update.$field {
379                    self.$field = v;
380                }
381            };
382        }
383        apply!(heading_style);
384        apply!(list_indent_type);
385        apply!(list_indent_width);
386        apply!(bullets);
387        apply!(strong_em_symbol);
388        apply!(escape_asterisks);
389        apply!(escape_underscores);
390        apply!(escape_misc);
391        apply!(escape_ascii);
392        apply!(code_language);
393        apply!(autolinks);
394        apply!(default_title);
395        apply!(br_in_tables);
396        apply!(highlight_style);
397        apply!(extract_metadata);
398        apply!(whitespace_mode);
399        apply!(strip_newlines);
400        apply!(wrap);
401        apply!(wrap_width);
402        apply!(convert_as_inline);
403        apply!(sub_symbol);
404        apply!(sup_symbol);
405        apply!(newline_style);
406        apply!(code_block_style);
407        apply!(keep_inline_images_in);
408        apply!(encoding);
409        apply!(debug);
410        apply!(strip_tags);
411        apply!(preserve_tags);
412        apply!(skip_images);
413        apply!(link_style);
414        apply!(output_format);
415        apply!(include_document_structure);
416        apply!(extract_images);
417        apply!(max_image_size);
418        apply!(capture_svg);
419        apply!(infer_dimensions);
420        if let Some(preprocessing) = update.preprocessing {
421            self.preprocessing.apply_update(preprocessing);
422        }
423    }
424
425    /// Create from a partial update, applying to defaults.
426    #[must_use]
427    pub fn from_update(update: ConversionOptionsUpdate) -> Self {
428        let mut options = Self::default();
429        options.apply_update(update);
430        options
431    }
432}
433
434impl From<ConversionOptionsUpdate> for ConversionOptions {
435    fn from(update: ConversionOptionsUpdate) -> Self {
436        Self::from_update(update)
437    }
438}
439
440// ── Tests ───────────────────────────────────────────────────────────────────
441
// Only the serde round-trip tests need the `serde`/`metadata` features, so the feature
// gate lives on those tests rather than on the whole module; the builder and update
// tests run in every test configuration.
#[cfg(test)]
mod tests {
    use super::*;

    /// Options survive a JSON serialize/deserialize round trip.
    #[cfg(any(feature = "serde", feature = "metadata"))]
    #[test]
    fn test_conversion_options_serde() {
        let options = ConversionOptions::builder()
            .heading_style(HeadingStyle::AtxClosed)
            .list_indent_width(4)
            .bullets("*")
            .escape_asterisks(true)
            .whitespace_mode(WhitespaceMode::Strict)
            .build();

        let json = serde_json::to_string(&options).expect("Failed to serialize");
        let deserialized: ConversionOptions = serde_json::from_str(&json).expect("Failed to deserialize");

        assert_eq!(deserialized.list_indent_width, 4);
        assert_eq!(deserialized.bullets, "*");
        assert!(deserialized.escape_asterisks);
        assert_eq!(deserialized.heading_style, HeadingStyle::AtxClosed);
        assert_eq!(deserialized.whitespace_mode, WhitespaceMode::Strict);
    }

    /// Fields missing from the JSON input fall back to their defaults.
    #[cfg(any(feature = "serde", feature = "metadata"))]
    #[test]
    fn test_conversion_options_partial_deserialization() {
        let partial_json = r#"{
            "heading_style": "atxclosed",
            "list_indent_width": 4,
            "bullets": "*"
        }"#;

        let deserialized: ConversionOptions =
            serde_json::from_str(partial_json).expect("Failed to deserialize partial JSON");

        assert_eq!(deserialized.heading_style, HeadingStyle::AtxClosed);
        assert_eq!(deserialized.list_indent_width, 4);
        assert_eq!(deserialized.bullets, "*");
        assert!(!deserialized.escape_asterisks);
        assert!(!deserialized.escape_underscores);
        assert_eq!(deserialized.list_indent_type, ListIndentType::Spaces);
    }

    /// Builder setters overwrite exactly the fields they name.
    #[test]
    fn test_builder_pattern() {
        let options = ConversionOptions::builder()
            .heading_style(HeadingStyle::Underlined)
            .wrap(true)
            .wrap_width(100)
            .include_document_structure(true)
            .extract_images(true)
            .build();

        assert_eq!(options.heading_style, HeadingStyle::Underlined);
        assert!(options.wrap);
        assert_eq!(options.wrap_width, 100);
        assert!(options.include_document_structure);
        assert!(options.extract_images);
    }

    /// A partial update overrides only the fields that are `Some`; the rest keep defaults.
    #[test]
    fn test_from_update_overrides_only_set_fields() {
        let update = ConversionOptionsUpdate {
            wrap: Some(true),
            wrap_width: Some(120),
            ..ConversionOptionsUpdate::default()
        };

        let options = ConversionOptions::from_update(update);

        assert!(options.wrap);
        assert_eq!(options.wrap_width, 120);
        assert_eq!(options.list_indent_width, 2);
        assert_eq!(options.bullets, "-*+");
        assert!(options.autolinks);
    }
}