// html_to_markdown_rs/metadata/config.rs

//! Metadata extraction configuration.

/// Default maximum size for structured data extraction, in bytes.
///
/// One decimal megabyte (1,000,000 bytes, not 1,048,576).
pub const DEFAULT_MAX_STRUCTURED_DATA_SIZE: usize = 1_000_000;

/// Configuration for metadata extraction granularity.
///
/// Controls which metadata types are extracted and size limits for safety.
/// Enables selective extraction of different metadata categories from HTML documents,
/// allowing fine-grained control over which types of information to collect during
/// the HTML-to-Markdown conversion process.
///
/// Two configurations compare equal when every flag and the size limit match.
///
/// # Fields
///
/// - `extract_document`: Enable document-level metadata extraction (title, description, author, Open Graph, Twitter Card, etc.)
/// - `extract_headers`: Enable heading element extraction (h1-h6) with hierarchy tracking
/// - `extract_links`: Enable anchor element extraction with link type classification
/// - `extract_images`: Enable image element extraction with source and dimension metadata
/// - `extract_structured_data`: Enable structured data extraction (JSON-LD, Microdata, RDFa)
/// - `max_structured_data_size`: Safety limit on total structured data size in bytes
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::MetadataConfig;
/// let config = MetadataConfig {
///     extract_document: true,
///     extract_headers: true,
///     extract_links: true,
///     extract_images: true,
///     extract_structured_data: true,
///     max_structured_data_size: 1_000_000,
/// };
///
/// assert!(config.extract_headers);
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct MetadataConfig {
    /// Extract document-level metadata (title, description, author, etc.).
    ///
    /// When enabled, collects metadata from `<head>` section including:
    /// - `<title>` element content
    /// - `<meta name="description">` and other standard meta tags
    /// - Open Graph (og:*) properties for social media optimization
    /// - Twitter Card (twitter:*) properties
    /// - Language and text direction attributes
    /// - Canonical URL and base href references
    pub extract_document: bool,

    /// Extract h1-h6 header elements and their hierarchy.
    ///
    /// When enabled, collects all heading elements with:
    /// - Header level (1-6)
    /// - Text content (normalized)
    /// - HTML id attribute if present
    /// - Document tree depth for hierarchy tracking
    /// - Byte offset in original HTML for positioning
    pub extract_headers: bool,

    /// Extract anchor (a) elements as links with type classification.
    ///
    /// When enabled, collects all hyperlinks with:
    /// - href attribute value
    /// - Link text content
    /// - Title attribute (tooltip text)
    /// - Automatic link type classification (anchor, internal, external, email, phone, other)
    /// - Rel attribute values
    /// - Additional custom attributes
    pub extract_links: bool,

    /// Extract image elements and data URIs.
    ///
    /// When enabled, collects all image elements with:
    /// - Source URL or data URI
    /// - Alt text for accessibility
    /// - Title attribute
    /// - Dimensions (width, height) if available
    /// - Automatic image type classification (data URI, external, relative, inline SVG)
    /// - Additional custom attributes
    pub extract_images: bool,

    /// Extract structured data (JSON-LD, Microdata, RDFa).
    ///
    /// When enabled, collects machine-readable structured data including:
    /// - JSON-LD script blocks with schema detection
    /// - Microdata attributes (itemscope, itemtype, itemprop)
    /// - RDFa markup
    /// - Extracted schema type if detectable
    pub extract_structured_data: bool,

    /// Maximum total size of structured data to collect (bytes).
    ///
    /// Prevents memory exhaustion attacks on malformed or adversarial documents
    /// containing excessively large structured data blocks. When the accumulated
    /// size of structured data exceeds this limit, further collection stops.
    /// Default: `1_000_000` bytes (1 MB)
    pub max_structured_data_size: usize,
}

/// Partial update for `MetadataConfig`.
///
/// This struct uses `Option<T>` to represent optional fields that can be selectively updated.
/// Only specified fields (Some values) will override existing config; None values leave the
/// corresponding fields unchanged when applied via [`MetadataConfig::apply_update`].
///
/// The derived `Default` yields an update with every field set to `None`, i.e. one that
/// changes nothing.
///
/// When deserialization support is compiled in, each field is read from its camelCase key
/// (for example `extractDocument`) and also accepts the snake_case spelling as an alias
/// (for example `extract_document`).
///
/// # Fields
///
/// - `extract_document`: Optional override for document-level metadata extraction
/// - `extract_headers`: Optional override for heading element extraction
/// - `extract_links`: Optional override for link element extraction
/// - `extract_images`: Optional override for image element extraction
/// - `extract_structured_data`: Optional override for structured data extraction
/// - `max_structured_data_size`: Optional override for structured data size limit
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::{MetadataConfig, MetadataConfigUpdate};
/// let update = MetadataConfigUpdate {
///     extract_document: Some(false),
///     extract_headers: Some(true),
///     extract_links: None,  // No change
///     extract_images: None,  // No change
///     extract_structured_data: None,  // No change
///     max_structured_data_size: None,  // No change
/// };
///
/// let mut config = MetadataConfig::default();
/// config.apply_update(update);
/// assert!(!config.extract_document);
/// assert!(config.extract_headers);
/// ```
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(any(feature = "serde", feature = "metadata"), derive(serde::Deserialize))]
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
pub struct MetadataConfigUpdate {
    /// Optional override for extracting document-level metadata.
    ///
    /// When Some(true), enables document metadata extraction; Some(false) disables it.
    /// None leaves the current setting unchanged.
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_document"))]
    pub extract_document: Option<bool>,

    /// Optional override for extracting heading elements (h1-h6).
    ///
    /// When Some(true), enables header extraction; Some(false) disables it.
    /// None leaves the current setting unchanged.
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_headers"))]
    pub extract_headers: Option<bool>,

    /// Optional override for extracting anchor (link) elements.
    ///
    /// When Some(true), enables link extraction; Some(false) disables it.
    /// None leaves the current setting unchanged.
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_links"))]
    pub extract_links: Option<bool>,

    /// Optional override for extracting image elements.
    ///
    /// When Some(true), enables image extraction; Some(false) disables it.
    /// None leaves the current setting unchanged.
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_images"))]
    pub extract_images: Option<bool>,

    /// Optional override for extracting structured data (JSON-LD, Microdata, RDFa).
    ///
    /// When Some(true), enables structured data extraction; Some(false) disables it.
    /// None leaves the current setting unchanged.
    #[cfg_attr(
        any(feature = "serde", feature = "metadata"),
        serde(alias = "extract_structured_data")
    )]
    pub extract_structured_data: Option<bool>,

    /// Optional override for maximum structured data collection size in bytes.
    ///
    /// When Some(size), sets the new size limit. None leaves the current limit unchanged.
    /// Use this to adjust safety thresholds for different documents.
    #[cfg_attr(
        any(feature = "serde", feature = "metadata"),
        serde(alias = "max_structured_data_size")
    )]
    pub max_structured_data_size: Option<usize>,
}

187impl Default for MetadataConfig {
188    /// Create default metadata configuration.
189    ///
190    /// Defaults to extracting all metadata types with 1MB limit on structured data.
191    fn default() -> Self {
192        Self {
193            extract_document: true,
194            extract_headers: true,
195            extract_links: true,
196            extract_images: true,
197            extract_structured_data: true,
198            max_structured_data_size: DEFAULT_MAX_STRUCTURED_DATA_SIZE,
199        }
200    }
201}
202
203impl MetadataConfig {
204    /// Check if any metadata extraction is enabled.
205    ///
206    /// Returns `true` if at least one extraction category is enabled, `false` if all are disabled.
207    /// This is useful for early exit optimization when the application doesn't need metadata.
208    ///
209    /// # Returns
210    ///
211    /// `true` if any of the extraction flags are enabled, `false` if all are disabled.
212    ///
213    /// # Examples
214    ///
215    /// ```
216    /// # use html_to_markdown_rs::metadata::MetadataConfig;
217    /// // All enabled
218    /// let config = MetadataConfig::default();
219    /// assert!(config.any_enabled());
220    ///
221    /// // Selectively enabled
222    /// let config = MetadataConfig {
223    ///     extract_headers: true,
224    ///     extract_document: false,
225    ///     extract_links: false,
226    ///     extract_images: false,
227    ///     extract_structured_data: false,
228    ///     max_structured_data_size: 1_000_000,
229    /// };
230    /// assert!(config.any_enabled());
231    ///
232    /// // All disabled
233    /// let config = MetadataConfig {
234    ///     extract_document: false,
235    ///     extract_headers: false,
236    ///     extract_links: false,
237    ///     extract_images: false,
238    ///     extract_structured_data: false,
239    ///     max_structured_data_size: 1_000_000,
240    /// };
241    /// assert!(!config.any_enabled());
242    /// ```
243    #[must_use]
244    pub const fn any_enabled(&self) -> bool {
245        self.extract_document
246            || self.extract_headers
247            || self.extract_links
248            || self.extract_images
249            || self.extract_structured_data
250    }
251
252    /// Apply a partial update to this metadata configuration.
253    ///
254    /// Any specified fields in the update (Some values) will override the current values.
255    /// Unspecified fields (None) are left unchanged. This allows selective modification
256    /// of configuration without affecting unrelated settings.
257    ///
258    /// # Arguments
259    ///
260    /// * `update` - Partial metadata config update with fields to override
261    ///
262    /// # Examples
263    ///
264    /// ```
265    /// # use html_to_markdown_rs::metadata::{MetadataConfig, MetadataConfigUpdate};
266    /// let mut config = MetadataConfig::default();
267    /// // config starts with all extraction enabled
268    ///
269    /// let update = MetadataConfigUpdate {
270    ///     extract_document: Some(false),
271    ///     extract_images: Some(false),
272    ///     // All other fields are None, so they won't change
273    ///     ..Default::default()
274    /// };
275    ///
276    /// config.apply_update(update);
277    ///
278    /// assert!(!config.extract_document);
279    /// assert!(!config.extract_images);
280    /// assert!(config.extract_headers);  // Unchanged
281    /// assert!(config.extract_links);    // Unchanged
282    /// ```
283    pub const fn apply_update(&mut self, update: MetadataConfigUpdate) {
284        if let Some(extract_document) = update.extract_document {
285            self.extract_document = extract_document;
286        }
287        if let Some(extract_headers) = update.extract_headers {
288            self.extract_headers = extract_headers;
289        }
290        if let Some(extract_links) = update.extract_links {
291            self.extract_links = extract_links;
292        }
293        if let Some(extract_images) = update.extract_images {
294            self.extract_images = extract_images;
295        }
296        if let Some(extract_structured_data) = update.extract_structured_data {
297            self.extract_structured_data = extract_structured_data;
298        }
299        if let Some(max_structured_data_size) = update.max_structured_data_size {
300            self.max_structured_data_size = max_structured_data_size;
301        }
302    }
303
304    /// Create new metadata configuration from a partial update.
305    ///
306    /// Creates a new `MetadataConfig` struct with defaults, then applies the update.
307    /// Fields not specified in the update (None) keep their default values.
308    /// This is a convenience method for constructing a configuration from a partial specification
309    /// without needing to explicitly call `.default()` first.
310    ///
311    /// # Arguments
312    ///
313    /// * `update` - Partial metadata config update with fields to set
314    ///
315    /// # Returns
316    ///
317    /// New `MetadataConfig` with specified updates applied to defaults
318    ///
319    /// # Examples
320    ///
321    /// ```
322    /// # use html_to_markdown_rs::metadata::{MetadataConfig, MetadataConfigUpdate};
323    /// let update = MetadataConfigUpdate {
324    ///     extract_document: Some(false),
325    ///     extract_headers: Some(true),
326    ///     extract_links: Some(true),
327    ///     extract_images: None,  // Will use default (true)
328    ///     extract_structured_data: None,  // Will use default (true)
329    ///     max_structured_data_size: None,  // Will use default (1MB)
330    /// };
331    ///
332    /// let config = MetadataConfig::from_update(update);
333    ///
334    /// assert!(!config.extract_document);
335    /// assert!(config.extract_headers);
336    /// assert!(config.extract_links);
337    /// assert!(config.extract_images);  // Default
338    /// assert!(config.extract_structured_data);  // Default
339    /// ```
340    #[must_use]
341    pub fn from_update(update: MetadataConfigUpdate) -> Self {
342        let mut config = Self::default();
343        config.apply_update(update);
344        config
345    }
346}
347
348impl From<MetadataConfigUpdate> for MetadataConfig {
349    fn from(update: MetadataConfigUpdate) -> Self {
350        Self::from_update(update)
351    }
352}
353
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration enables every category and uses the default size limit.
    #[test]
    fn test_metadata_config_default() {
        let config = MetadataConfig::default();

        assert!(config.extract_document);
        assert!(config.extract_headers);
        assert!(config.extract_links);
        assert!(config.extract_images);
        assert!(config.extract_structured_data);
        assert_eq!(config.max_structured_data_size, DEFAULT_MAX_STRUCTURED_DATA_SIZE);
    }

    /// `any_enabled` is true as soon as a single flag is set and false when none are.
    #[test]
    fn test_metadata_config_any_enabled() {
        let all_enabled = MetadataConfig::default();
        assert!(all_enabled.any_enabled());

        let some_enabled = MetadataConfig {
            extract_headers: true,
            extract_document: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: 1_000_000,
        };
        assert!(some_enabled.any_enabled());

        let none_enabled = MetadataConfig {
            extract_document: false,
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: 1_000_000,
        };
        assert!(!none_enabled.any_enabled());
    }

    /// Only the `Some` fields of an update are written; `None` fields are left alone.
    #[test]
    fn test_metadata_config_apply_update() {
        let mut config = MetadataConfig::default();
        config.apply_update(MetadataConfigUpdate {
            extract_document: Some(false),
            max_structured_data_size: Some(512),
            ..Default::default()
        });

        assert!(!config.extract_document);
        assert_eq!(config.max_structured_data_size, 512);
        assert!(config.extract_headers);
        assert!(config.extract_links);
        assert!(config.extract_images);
        assert!(config.extract_structured_data);

        // An update with no overrides must not touch anything.
        config.apply_update(MetadataConfigUpdate::default());
        assert!(!config.extract_document);
        assert_eq!(config.max_structured_data_size, 512);
        assert!(config.extract_headers);
    }

    /// `from_update` and the `From` impl both start from defaults and agree with each other.
    #[test]
    fn test_metadata_config_from_update() {
        let update = MetadataConfigUpdate {
            extract_links: Some(false),
            ..Default::default()
        };

        let via_method = MetadataConfig::from_update(update.clone());
        let via_trait: MetadataConfig = update.into();

        for config in [via_method, via_trait] {
            assert!(config.extract_document);
            assert!(config.extract_headers);
            assert!(!config.extract_links);
            assert!(config.extract_images);
            assert!(config.extract_structured_data);
            assert_eq!(config.max_structured_data_size, DEFAULT_MAX_STRUCTURED_DATA_SIZE);
        }
    }
}