Skip to main content

html_to_markdown_rs/options/
preprocessing.rs

1#![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
2
3//! HTML preprocessing configuration options.
4//!
5//! This module provides configuration for document cleanup before conversion,
6//! including preset levels and granular control over element removal.
7
8use crate::options::validation::normalize_token;
9
/// Preset describing how aggressively HTML is cleaned up before conversion.
///
/// The variants are listed from least to most cleanup; [`PreprocessingPreset::Standard`]
/// is the value produced by `Default`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum PreprocessingPreset {
    /// Lightest cleanup: only essential noise (scripts, styles) is removed.
    Minimal,
    /// Default level: also removes navigation, forms, and other auxiliary content.
    #[default]
    Standard,
    /// Heaviest cleanup: removes extensive non-content elements and structure.
    Aggressive,
}
23
24impl PreprocessingPreset {
25    /// Parse a preprocessing preset from a string.
26    ///
27    /// Accepts "minimal", "aggressive", or defaults to Standard.
28    /// Input is normalized (lowercased, alphanumeric only).
29    #[must_use]
30    pub fn parse(value: &str) -> Self {
31        match normalize_token(value).as_str() {
32            "minimal" => Self::Minimal,
33            "aggressive" => Self::Aggressive,
34            _ => Self::Standard,
35        }
36    }
37}
38
39/// HTML preprocessing options for document cleanup before conversion.
40#[derive(Debug, Clone)]
41#[cfg_attr(
42    any(feature = "serde", feature = "metadata"),
43    derive(serde::Serialize, serde::Deserialize)
44)]
45#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
46pub struct PreprocessingOptions {
47    /// Enable HTML preprocessing globally
48    pub enabled: bool,
49
50    /// Preprocessing preset level (Minimal, Standard, Aggressive)
51    pub preset: PreprocessingPreset,
52
53    /// Remove navigation elements (nav, breadcrumbs, menus, sidebars)
54    pub remove_navigation: bool,
55
56    /// Remove form elements (forms, inputs, buttons, etc.)
57    pub remove_forms: bool,
58}
59
60/// Partial update for `PreprocessingOptions`.
61///
62/// This struct uses `Option<T>` to represent optional fields that can be selectively updated.
63/// Only specified fields (Some values) will override existing options; None values leave the
64/// corresponding fields unchanged when applied via [`PreprocessingOptions::apply_update`].
65#[derive(Debug, Clone, Default)]
66#[cfg_attr(
67    any(feature = "serde", feature = "metadata"),
68    derive(serde::Serialize, serde::Deserialize)
69)]
70#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
71pub struct PreprocessingOptionsUpdate {
72    /// Optional global preprocessing enablement override
73    pub enabled: Option<bool>,
74
75    /// Optional preprocessing preset level override (Minimal, Standard, Aggressive)
76    pub preset: Option<PreprocessingPreset>,
77
78    /// Optional navigation element removal override (nav, breadcrumbs, menus, sidebars)
79    pub remove_navigation: Option<bool>,
80
81    /// Optional form element removal override (forms, inputs, buttons, etc.)
82    pub remove_forms: Option<bool>,
83}
84
85impl Default for PreprocessingOptions {
86    fn default() -> Self {
87        Self {
88            enabled: false,
89            preset: PreprocessingPreset::default(),
90            remove_navigation: true,
91            remove_forms: true,
92        }
93    }
94}
95
96impl PreprocessingOptions {
97    /// Apply a partial update to these preprocessing options.
98    ///
99    /// Any specified fields in the update will override the current values.
100    /// Unspecified fields (None) are left unchanged.
101    ///
102    /// # Arguments
103    ///
104    /// * `update` - Partial preprocessing options update
105    #[allow(clippy::needless_pass_by_value)]
106    pub const fn apply_update(&mut self, update: PreprocessingOptionsUpdate) {
107        if let Some(enabled) = update.enabled {
108            self.enabled = enabled;
109        }
110        if let Some(preset) = update.preset {
111            self.preset = preset;
112        }
113        if let Some(remove_navigation) = update.remove_navigation {
114            self.remove_navigation = remove_navigation;
115        }
116        if let Some(remove_forms) = update.remove_forms {
117            self.remove_forms = remove_forms;
118        }
119    }
120
121    /// Create new preprocessing options from a partial update.
122    ///
123    /// Creates a new `PreprocessingOptions` struct with defaults, then applies the update.
124    /// Fields not specified in the update keep their default values.
125    ///
126    /// # Arguments
127    ///
128    /// * `update` - Partial preprocessing options update
129    ///
130    /// # Returns
131    ///
132    /// New `PreprocessingOptions` with specified updates applied to defaults
133    #[must_use]
134    pub fn from_update(update: PreprocessingOptionsUpdate) -> Self {
135        let mut options = Self::default();
136        options.apply_update(update);
137        options
138    }
139}
140
141impl From<PreprocessingOptionsUpdate> for PreprocessingOptions {
142    fn from(update: PreprocessingOptionsUpdate) -> Self {
143        Self::from_update(update)
144    }
145}
146
#[cfg(any(feature = "serde", feature = "metadata"))]
mod serde_impls {
    use super::PreprocessingPreset;
    use serde::{Deserialize, Deserializer, Serialize, Serializer};

    impl<'de> Deserialize<'de> for PreprocessingPreset {
        /// Read the preset from a string and hand it to [`PreprocessingPreset::parse`].
        ///
        /// Only a failure to deserialize the string itself is reported as an error;
        /// whatever `parse` returns for the text is accepted as-is.
        fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
        where
            D: Deserializer<'de>,
        {
            String::deserialize(deserializer).map(|raw| PreprocessingPreset::parse(&raw))
        }
    }

    impl Serialize for PreprocessingPreset {
        /// Write the preset as its lowercase name: `"minimal"`, `"standard"` or
        /// `"aggressive"`.
        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
        where
            S: Serializer,
        {
            serializer.serialize_str(match *self {
                PreprocessingPreset::Minimal => "minimal",
                PreprocessingPreset::Standard => "standard",
                PreprocessingPreset::Aggressive => "aggressive",
            })
        }
    }
}
176
#[cfg(all(test, any(feature = "serde", feature = "metadata")))]
mod tests {
    use super::*;

    /// Round-trips non-default options through JSON and checks that every field
    /// comes back with the value that went in.
    #[test]
    fn test_preprocessing_options_serde() {
        // Struct-update syntax instead of mutating a default instance field by field.
        let options = PreprocessingOptions {
            enabled: true,
            preset: PreprocessingPreset::Aggressive,
            remove_navigation: false,
            ..PreprocessingOptions::default()
        };

        // Serialize to JSON
        let json = serde_json::to_string(&options).expect("Failed to serialize");

        // Deserialize back
        let deserialized: PreprocessingOptions = serde_json::from_str(&json).expect("Failed to deserialize");

        // Verify values
        assert!(deserialized.enabled);
        assert_eq!(deserialized.preset, PreprocessingPreset::Aggressive);
        assert!(!deserialized.remove_navigation);
        // The field left at its default must survive the round trip as well.
        assert_eq!(deserialized.remove_forms, options.remove_forms);
    }
}