html_to_markdown_rs/metadata/config.rs
1//! Metadata extraction configuration.
2
/// Default cap, in bytes, on the structured data collected during extraction.
///
/// Equals 1 MB in decimal units (1,000,000 bytes). [`MetadataConfig::default`]
/// uses this value for `max_structured_data_size`.
pub const DEFAULT_MAX_STRUCTURED_DATA_SIZE: usize = 1_000 * 1_000;
5
/// Configuration for metadata extraction granularity.
///
/// Selects which categories of metadata are collected from an HTML document
/// during HTML-to-Markdown conversion, and bounds how much structured data may
/// be accumulated. Each category can be toggled independently.
///
/// Values can be compared with `==` / `assert_eq!`, which is convenient for
/// checking whether a configuration still matches the defaults.
///
/// # Fields
///
/// - `extract_document`: Enable document-level metadata extraction (title, description, author, Open Graph, Twitter Card, etc.)
/// - `extract_headers`: Enable heading element extraction (h1-h6) with hierarchy tracking
/// - `extract_links`: Enable anchor element extraction with link type classification
/// - `extract_images`: Enable image element extraction with source and dimension metadata
/// - `extract_structured_data`: Enable structured data extraction (JSON-LD, Microdata, RDFa)
/// - `max_structured_data_size`: Safety limit on total structured data size in bytes
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::MetadataConfig;
/// let config = MetadataConfig {
///     extract_document: true,
///     extract_headers: true,
///     extract_links: true,
///     extract_images: true,
///     extract_structured_data: true,
///     max_structured_data_size: 1_000_000,
/// };
///
/// assert!(config.extract_headers);
/// assert_eq!(config, MetadataConfig::default());
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct MetadataConfig {
    /// Extract document-level metadata (title, description, author, etc.).
    ///
    /// When enabled, collects metadata from `<head>` section including:
    /// - `<title>` element content
    /// - `<meta name="description">` and other standard meta tags
    /// - Open Graph (og:*) properties for social media optimization
    /// - Twitter Card (twitter:*) properties
    /// - Language and text direction attributes
    /// - Canonical URL and base href references
    pub extract_document: bool,

    /// Extract h1-h6 header elements and their hierarchy.
    ///
    /// When enabled, collects all heading elements with:
    /// - Header level (1-6)
    /// - Text content (normalized)
    /// - HTML id attribute if present
    /// - Document tree depth for hierarchy tracking
    /// - Byte offset in original HTML for positioning
    pub extract_headers: bool,

    /// Extract anchor (a) elements as links with type classification.
    ///
    /// When enabled, collects all hyperlinks with:
    /// - href attribute value
    /// - Link text content
    /// - Title attribute (tooltip text)
    /// - Automatic link type classification (anchor, internal, external, email, phone, other)
    /// - Rel attribute values
    /// - Additional custom attributes
    pub extract_links: bool,

    /// Extract image elements and data URIs.
    ///
    /// When enabled, collects all image elements with:
    /// - Source URL or data URI
    /// - Alt text for accessibility
    /// - Title attribute
    /// - Dimensions (width, height) if available
    /// - Automatic image type classification (data URI, external, relative, inline SVG)
    /// - Additional custom attributes
    pub extract_images: bool,

    /// Extract structured data (JSON-LD, Microdata, RDFa).
    ///
    /// When enabled, collects machine-readable structured data including:
    /// - JSON-LD script blocks with schema detection
    /// - Microdata attributes (itemscope, itemtype, itemprop)
    /// - RDFa markup
    /// - Extracted schema type if detectable
    pub extract_structured_data: bool,

    /// Maximum total size of structured data to collect (bytes).
    ///
    /// Prevents memory exhaustion attacks on malformed or adversarial documents
    /// containing excessively large structured data blocks. When the accumulated
    /// size of structured data exceeds this limit, further collection stops.
    /// Default: `1_000_000` bytes (1 MB)
    pub max_structured_data_size: usize,
}
100
/// Partial update for `MetadataConfig`.
///
/// Every field is an `Option<T>`: `Some(value)` overrides the corresponding
/// setting and `None` leaves it untouched when the update is applied via
/// [`MetadataConfig::apply_update`]. The derived `Default` is the empty update
/// (all fields `None`).
///
/// When deserialization support is compiled in (the `serde` or `metadata`
/// feature), fields are read from camelCase keys, and the snake_case field
/// names are accepted as aliases.
///
/// # Fields
///
/// - `extract_document`: Optional override for document-level metadata extraction
/// - `extract_headers`: Optional override for heading element extraction
/// - `extract_links`: Optional override for link element extraction
/// - `extract_images`: Optional override for image element extraction
/// - `extract_structured_data`: Optional override for structured data extraction
/// - `max_structured_data_size`: Optional override for structured data size limit
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::{MetadataConfig, MetadataConfigUpdate};
/// let update = MetadataConfigUpdate {
///     extract_document: Some(false),
///     extract_headers: Some(true),
///     extract_links: None,              // No change
///     extract_images: None,             // No change
///     extract_structured_data: None,    // No change
///     max_structured_data_size: None,   // No change
/// };
///
/// let mut config = MetadataConfig::default();
/// config.apply_update(update);
/// assert!(!config.extract_document);
/// assert!(config.extract_headers);
/// ```
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(any(feature = "serde", feature = "metadata"), derive(serde::Deserialize))]
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
pub struct MetadataConfigUpdate {
    /// Optional override for extracting document-level metadata.
    ///
    /// When Some(true), enables document metadata extraction; Some(false) disables it.
    /// None leaves the current setting unchanged.
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_document"))]
    pub extract_document: Option<bool>,

    /// Optional override for extracting heading elements (h1-h6).
    ///
    /// When Some(true), enables header extraction; Some(false) disables it.
    /// None leaves the current setting unchanged.
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_headers"))]
    pub extract_headers: Option<bool>,

    /// Optional override for extracting anchor (link) elements.
    ///
    /// When Some(true), enables link extraction; Some(false) disables it.
    /// None leaves the current setting unchanged.
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_links"))]
    pub extract_links: Option<bool>,

    /// Optional override for extracting image elements.
    ///
    /// When Some(true), enables image extraction; Some(false) disables it.
    /// None leaves the current setting unchanged.
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_images"))]
    pub extract_images: Option<bool>,

    /// Optional override for extracting structured data (JSON-LD, Microdata, RDFa).
    ///
    /// When Some(true), enables structured data extraction; Some(false) disables it.
    /// None leaves the current setting unchanged.
    #[cfg_attr(
        any(feature = "serde", feature = "metadata"),
        serde(alias = "extract_structured_data")
    )]
    pub extract_structured_data: Option<bool>,

    /// Optional override for maximum structured data collection size in bytes.
    ///
    /// When Some(size), sets the new size limit. None leaves the current limit unchanged.
    /// Use this to adjust safety thresholds for different documents.
    #[cfg_attr(
        any(feature = "serde", feature = "metadata"),
        serde(alias = "max_structured_data_size")
    )]
    pub max_structured_data_size: Option<usize>,
}
186
187impl Default for MetadataConfig {
188 /// Create default metadata configuration.
189 ///
190 /// Defaults to extracting all metadata types with 1MB limit on structured data.
191 fn default() -> Self {
192 Self {
193 extract_document: true,
194 extract_headers: true,
195 extract_links: true,
196 extract_images: true,
197 extract_structured_data: true,
198 max_structured_data_size: DEFAULT_MAX_STRUCTURED_DATA_SIZE,
199 }
200 }
201}
202
203impl MetadataConfig {
204 /// Check if any metadata extraction is enabled.
205 ///
206 /// Returns `true` if at least one extraction category is enabled, `false` if all are disabled.
207 /// This is useful for early exit optimization when the application doesn't need metadata.
208 ///
209 /// # Returns
210 ///
211 /// `true` if any of the extraction flags are enabled, `false` if all are disabled.
212 ///
213 /// # Examples
214 ///
215 /// ```
216 /// # use html_to_markdown_rs::metadata::MetadataConfig;
217 /// // All enabled
218 /// let config = MetadataConfig::default();
219 /// assert!(config.any_enabled());
220 ///
221 /// // Selectively enabled
222 /// let config = MetadataConfig {
223 /// extract_headers: true,
224 /// extract_document: false,
225 /// extract_links: false,
226 /// extract_images: false,
227 /// extract_structured_data: false,
228 /// max_structured_data_size: 1_000_000,
229 /// };
230 /// assert!(config.any_enabled());
231 ///
232 /// // All disabled
233 /// let config = MetadataConfig {
234 /// extract_document: false,
235 /// extract_headers: false,
236 /// extract_links: false,
237 /// extract_images: false,
238 /// extract_structured_data: false,
239 /// max_structured_data_size: 1_000_000,
240 /// };
241 /// assert!(!config.any_enabled());
242 /// ```
243 #[must_use]
244 pub const fn any_enabled(&self) -> bool {
245 self.extract_document
246 || self.extract_headers
247 || self.extract_links
248 || self.extract_images
249 || self.extract_structured_data
250 }
251
252 /// Apply a partial update to this metadata configuration.
253 ///
254 /// Any specified fields in the update (Some values) will override the current values.
255 /// Unspecified fields (None) are left unchanged. This allows selective modification
256 /// of configuration without affecting unrelated settings.
257 ///
258 /// # Arguments
259 ///
260 /// * `update` - Partial metadata config update with fields to override
261 ///
262 /// # Examples
263 ///
264 /// ```
265 /// # use html_to_markdown_rs::metadata::{MetadataConfig, MetadataConfigUpdate};
266 /// let mut config = MetadataConfig::default();
267 /// // config starts with all extraction enabled
268 ///
269 /// let update = MetadataConfigUpdate {
270 /// extract_document: Some(false),
271 /// extract_images: Some(false),
272 /// // All other fields are None, so they won't change
273 /// ..Default::default()
274 /// };
275 ///
276 /// config.apply_update(update);
277 ///
278 /// assert!(!config.extract_document);
279 /// assert!(!config.extract_images);
280 /// assert!(config.extract_headers); // Unchanged
281 /// assert!(config.extract_links); // Unchanged
282 /// ```
283 pub const fn apply_update(&mut self, update: MetadataConfigUpdate) {
284 if let Some(extract_document) = update.extract_document {
285 self.extract_document = extract_document;
286 }
287 if let Some(extract_headers) = update.extract_headers {
288 self.extract_headers = extract_headers;
289 }
290 if let Some(extract_links) = update.extract_links {
291 self.extract_links = extract_links;
292 }
293 if let Some(extract_images) = update.extract_images {
294 self.extract_images = extract_images;
295 }
296 if let Some(extract_structured_data) = update.extract_structured_data {
297 self.extract_structured_data = extract_structured_data;
298 }
299 if let Some(max_structured_data_size) = update.max_structured_data_size {
300 self.max_structured_data_size = max_structured_data_size;
301 }
302 }
303
304 /// Create new metadata configuration from a partial update.
305 ///
306 /// Creates a new `MetadataConfig` struct with defaults, then applies the update.
307 /// Fields not specified in the update (None) keep their default values.
308 /// This is a convenience method for constructing a configuration from a partial specification
309 /// without needing to explicitly call `.default()` first.
310 ///
311 /// # Arguments
312 ///
313 /// * `update` - Partial metadata config update with fields to set
314 ///
315 /// # Returns
316 ///
317 /// New `MetadataConfig` with specified updates applied to defaults
318 ///
319 /// # Examples
320 ///
321 /// ```
322 /// # use html_to_markdown_rs::metadata::{MetadataConfig, MetadataConfigUpdate};
323 /// let update = MetadataConfigUpdate {
324 /// extract_document: Some(false),
325 /// extract_headers: Some(true),
326 /// extract_links: Some(true),
327 /// extract_images: None, // Will use default (true)
328 /// extract_structured_data: None, // Will use default (true)
329 /// max_structured_data_size: None, // Will use default (1MB)
330 /// };
331 ///
332 /// let config = MetadataConfig::from_update(update);
333 ///
334 /// assert!(!config.extract_document);
335 /// assert!(config.extract_headers);
336 /// assert!(config.extract_links);
337 /// assert!(config.extract_images); // Default
338 /// assert!(config.extract_structured_data); // Default
339 /// ```
340 #[must_use]
341 pub fn from_update(update: MetadataConfigUpdate) -> Self {
342 let mut config = Self::default();
343 config.apply_update(update);
344 config
345 }
346}
347
348impl From<MetadataConfigUpdate> for MetadataConfig {
349 fn from(update: MetadataConfigUpdate) -> Self {
350 Self::from_update(update)
351 }
352}
353
#[cfg(test)]
mod tests {
    use super::*;

    /// A configuration with every extraction flag cleared.
    fn all_disabled() -> MetadataConfig {
        MetadataConfig {
            extract_document: false,
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: 1_000_000,
        }
    }

    #[test]
    fn test_metadata_config_default() {
        let config = MetadataConfig::default();

        assert!(config.extract_document);
        assert!(config.extract_headers);
        assert!(config.extract_links);
        assert!(config.extract_images);
        assert!(config.extract_structured_data);
        assert_eq!(config.max_structured_data_size, DEFAULT_MAX_STRUCTURED_DATA_SIZE);
    }

    #[test]
    fn test_metadata_config_any_enabled() {
        let all_enabled = MetadataConfig::default();
        assert!(all_enabled.any_enabled());

        let some_enabled = MetadataConfig {
            extract_headers: true,
            ..all_disabled()
        };
        assert!(some_enabled.any_enabled());

        let none_enabled = all_disabled();
        assert!(!none_enabled.any_enabled());
    }

    #[test]
    fn test_metadata_config_any_enabled_ignores_size_limit() {
        // A non-zero size limit alone must not count as an enabled category.
        let config = MetadataConfig {
            max_structured_data_size: usize::MAX,
            ..all_disabled()
        };
        assert!(!config.any_enabled());
    }

    #[test]
    fn test_metadata_config_apply_update_overrides_only_specified_fields() {
        let mut config = MetadataConfig::default();
        config.apply_update(MetadataConfigUpdate {
            extract_document: Some(false),
            max_structured_data_size: Some(2_048),
            ..Default::default()
        });

        assert!(!config.extract_document);
        assert_eq!(config.max_structured_data_size, 2_048);
        // Fields left as `None` keep their previous values.
        assert!(config.extract_headers);
        assert!(config.extract_links);
        assert!(config.extract_images);
        assert!(config.extract_structured_data);
    }

    #[test]
    fn test_metadata_config_apply_empty_update_is_noop() {
        let mut config = all_disabled();
        config.max_structured_data_size = 7;
        config.apply_update(MetadataConfigUpdate::default());

        assert!(!config.extract_document);
        assert!(!config.extract_headers);
        assert!(!config.extract_links);
        assert!(!config.extract_images);
        assert!(!config.extract_structured_data);
        assert_eq!(config.max_structured_data_size, 7);
    }

    #[test]
    fn test_metadata_config_apply_update_can_enable_flags() {
        let mut config = all_disabled();
        config.apply_update(MetadataConfigUpdate {
            extract_links: Some(true),
            extract_structured_data: Some(true),
            ..Default::default()
        });

        assert!(config.extract_links);
        assert!(config.extract_structured_data);
        assert!(!config.extract_document);
        assert!(!config.extract_headers);
        assert!(!config.extract_images);
    }

    #[test]
    fn test_metadata_config_from_update_uses_defaults_for_none() {
        let config = MetadataConfig::from_update(MetadataConfigUpdate {
            extract_images: Some(false),
            ..Default::default()
        });

        assert!(!config.extract_images);
        assert!(config.extract_document);
        assert!(config.extract_headers);
        assert!(config.extract_links);
        assert!(config.extract_structured_data);
        assert_eq!(config.max_structured_data_size, DEFAULT_MAX_STRUCTURED_DATA_SIZE);
    }

    #[test]
    fn test_metadata_config_from_trait_matches_from_update() {
        let update = MetadataConfigUpdate {
            extract_headers: Some(false),
            max_structured_data_size: Some(10),
            ..Default::default()
        };

        let via_method = MetadataConfig::from_update(update.clone());
        let via_trait: MetadataConfig = update.into();

        assert_eq!(via_trait.extract_document, via_method.extract_document);
        assert_eq!(via_trait.extract_headers, via_method.extract_headers);
        assert_eq!(via_trait.extract_links, via_method.extract_links);
        assert_eq!(via_trait.extract_images, via_method.extract_images);
        assert_eq!(
            via_trait.extract_structured_data,
            via_method.extract_structured_data
        );
        assert_eq!(
            via_trait.max_structured_data_size,
            via_method.max_structured_data_size
        );
        assert!(!via_trait.extract_headers);
        assert_eq!(via_trait.max_structured_data_size, 10);
    }
}