html_to_markdown_rs/
metadata.rs

1#![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
2//! Metadata extraction for HTML to Markdown conversion.
3//!
4//! This module provides comprehensive, type-safe metadata extraction during HTML-to-Markdown
5//! conversion, enabling content analysis, SEO optimization, and document indexing workflows.
6//! Metadata includes:
7//! - **Document metadata**: Title, description, author, language, canonical URL, Open Graph, Twitter Card
8//! - **Headers**: Heading elements (h1-h6) with hierarchy, IDs, and positions
9//! - **Links**: Hyperlinks with type classification (anchor, internal, external, email, phone)
10//! - **Images**: Image elements with source, alt text, dimensions, and type (data URI, external, etc.)
11//! - **Structured data**: JSON-LD, Microdata, and `RDFa` blocks
12//!
13//! The implementation follows a single-pass collector pattern for zero-overhead extraction
14//! when metadata features are disabled.
15//!
16//! # Architecture
17//!
18//! Metadata extraction uses the [`MetadataCollector`] pattern (similar to [`InlineImageCollector`]):
19//! - **Single-pass collection**: Metadata is gathered during the primary tree traversal without additional passes
20//! - **Zero overhead when disabled**: Entire module can be compiled out via feature flags
21//! - **Configurable granularity**: Use [`MetadataConfig`] to select which metadata types to extract
22//! - **Type-safe APIs**: All metadata types are enum-based with exhaustive matching
23//! - **Memory-bounded**: Size limits prevent memory exhaustion from adversarial documents
24//! - **Pre-allocated buffers**: Typical documents (32 headers, 64 links, 16 images) handled efficiently
25//!
26//! # Type Overview
27//!
28//! ## Enumerations
29//!
30//! - [`TextDirection`]: Document directionality (LTR, RTL, Auto)
31//! - [`LinkType`]: Link classification (Anchor, Internal, External, Email, Phone, Other)
32//! - [`ImageType`]: Image source type (`DataUri`, External, Relative, `InlineSvg`)
33//! - [`StructuredDataType`]: Structured data format (`JsonLd`, Microdata, `RDFa`)
34//!
35//! ## Structures
36//!
37//! - [`DocumentMetadata`]: Head-level metadata with maps for Open Graph and Twitter Card
38//! - [`HeaderMetadata`]: Heading element with level (1-6), text, ID, hierarchy depth, and position
39//! - [`LinkMetadata`]: Hyperlink with href, text, title, type, rel attributes, and custom attributes
40//! - [`ImageMetadata`]: Image element with src, alt, title, dimensions, type, and attributes
41//! - [`StructuredData`]: Structured data block with type and raw JSON
42//! - [`MetadataConfig`]: Configuration controlling extraction granularity and size limits
43//! - [`ExtendedMetadata`]: Top-level result containing all extracted metadata
44//!
45//! # Examples
46//!
47//! ## Basic Usage with `convert_with_metadata`
48//!
49//! ```ignore
50//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
51//!
52//! let html = r#"
53//!   <html lang="en">
54//!     <head>
55//!       <title>My Article</title>
56//!       <meta name="description" content="An interesting read">
57//!     </head>
58//!     <body>
59//!       <h1 id="main">Title</h1>
60//!       <a href="https://example.com">External Link</a>
61//!       <img src="photo.jpg" alt="A photo">
62//!     </body>
63//!   </html>
64//! "#;
65//!
66//! let config = MetadataConfig::default();
67//! let (markdown, metadata) = convert_with_metadata(html, None, config)?;
68//!
69//! // Access document metadata
70//! assert_eq!(metadata.document.title, Some("My Article".to_string()));
71//! assert_eq!(metadata.document.language, Some("en".to_string()));
72//!
73//! // Access headers
74//! assert_eq!(metadata.headers.len(), 1);
75//! assert_eq!(metadata.headers[0].level, 1);
76//! assert_eq!(metadata.headers[0].id, Some("main".to_string()));
77//!
78//! // Access links
79//! assert_eq!(metadata.links.len(), 1);
80//! assert_eq!(metadata.links[0].link_type, LinkType::External);
81//!
82//! // Access images
83//! assert_eq!(metadata.images.len(), 1);
84//! assert_eq!(metadata.images[0].image_type, ImageType::Relative);
85//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
86//! ```
87//!
88//! ## Selective Extraction
89//!
90//! ```ignore
91//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
92//!
//! let config = MetadataConfig {
//!     extract_document: true,
//!     extract_headers: true,
//!     extract_links: true,
//!     extract_images: false,  // Skip images
//!     extract_structured_data: false,  // Skip structured data
//!     max_structured_data_size: 0,
//! };
100//!
101//! let (markdown, metadata) = convert_with_metadata(html, None, config)?;
102//! assert_eq!(metadata.images.len(), 0);  // Images not extracted
103//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
104//! ```
105//!
106//! ## Analyzing Link Types
107//!
108//! ```ignore
109//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
110//! use html_to_markdown_rs::metadata::LinkType;
111//!
112//! let (_markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default(), None)?;
113//!
114//! for link in &metadata.links {
115//!     match link.link_type {
116//!         LinkType::External => println!("External: {}", link.href),
117//!         LinkType::Internal => println!("Internal: {}", link.href),
118//!         LinkType::Anchor => println!("Anchor: {}", link.href),
119//!         LinkType::Email => println!("Email: {}", link.href),
120//!         _ => {}
121//!     }
122//! }
123//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
124//! ```
125//!
126//! # Serialization
127//!
128//! All types in this module support serialization via `serde` when the `metadata` feature is enabled.
129//! This enables easy export to JSON, YAML, or other formats:
130//!
131//! ```ignore
132//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
133//!
134//! let (_markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default(), None)?;
135//! let json = serde_json::to_string_pretty(&metadata)?;
136//! println!("{}", json);
137//! # Ok::<(), Box<dyn std::error::Error>>(())
138//! ```
139
140use std::cell::RefCell;
141use std::collections::BTreeMap;
142use std::rc::Rc;
143
/// Direction in which document text flows.
///
/// The variants mirror the keywords accepted by the HTML `dir` attribute
/// (`ltr`, `rtl`, `auto`); those keywords are also what [`std::fmt::Display`]
/// prints and what serde uses when the `metadata` feature is enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub enum TextDirection {
    /// Text flows left to right (keyword `ltr`), as in Latin scripts.
    #[cfg_attr(feature = "metadata", serde(rename = "ltr"))]
    LeftToRight,
    /// Text flows right to left (keyword `rtl`), as in Hebrew, Arabic, or Urdu.
    #[cfg_attr(feature = "metadata", serde(rename = "rtl"))]
    RightToLeft,
    /// Direction is determined automatically (keyword `auto`).
    #[cfg_attr(feature = "metadata", serde(rename = "auto"))]
    Auto,
}

impl std::fmt::Display for TextDirection {
    /// Write the lowercase keyword for this direction (`ltr`, `rtl`, or `auto`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let keyword = match self {
            Self::LeftToRight => "ltr",
            Self::RightToLeft => "rtl",
            Self::Auto => "auto",
        };
        f.write_str(keyword)
    }
}

impl TextDirection {
    /// Convert a direction keyword into a [`TextDirection`].
    ///
    /// The comparison ignores ASCII case, so `"RTL"` and `"rtl"` are equivalent.
    /// The input is not trimmed: surrounding whitespace makes the value invalid.
    ///
    /// # Arguments
    ///
    /// * `s` - Candidate keyword (`"ltr"`, `"rtl"`, or `"auto"`)
    ///
    /// # Returns
    ///
    /// `Some(direction)` for a recognised keyword, `None` for anything else.
    ///
    /// # Examples
    ///
    /// ```
    /// # use html_to_markdown_rs::metadata::TextDirection;
    /// assert_eq!(TextDirection::parse("ltr"), Some(TextDirection::LeftToRight));
    /// assert_eq!(TextDirection::parse("RTL"), Some(TextDirection::RightToLeft));
    /// assert_eq!(TextDirection::parse("auto"), Some(TextDirection::Auto));
    /// assert_eq!(TextDirection::parse("invalid"), None);
    /// ```
    #[must_use]
    pub fn parse(s: &str) -> Option<Self> {
        // Keyword table; at most one entry can match a given input.
        const KEYWORDS: [(&str, TextDirection); 3] = [
            ("ltr", TextDirection::LeftToRight),
            ("rtl", TextDirection::RightToLeft),
            ("auto", TextDirection::Auto),
        ];

        KEYWORDS
            .iter()
            .find(|(keyword, _)| s.eq_ignore_ascii_case(keyword))
            .map(|&(_, direction)| direction)
    }
}
205
/// Category assigned to a hyperlink from its href value.
///
/// Lets consumers filter or group extracted links. The [`std::fmt::Display`]
/// output is the lowercase variant name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "metadata", serde(rename_all = "snake_case"))]
pub enum LinkType {
    /// Fragment link into the same document (href begins with `#`)
    Anchor,
    /// Link that stays within the same domain
    Internal,
    /// Link that points to a different domain
    External,
    /// `mailto:` link
    Email,
    /// `tel:` link
    Phone,
    /// Any other protocol, or a value that cannot be classified
    Other,
}

impl std::fmt::Display for LinkType {
    /// Write the lowercase name of the link category.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Anchor => "anchor",
            Self::Internal => "internal",
            Self::External => "external",
            Self::Email => "email",
            Self::Phone => "phone",
            Self::Other => "other",
        };
        f.write_str(label)
    }
}
239
/// Category assigned to an image according to where its content comes from.
///
/// Distinguishes embedded data URIs, inline SVG elements, external URLs, and
/// relative paths. The [`std::fmt::Display`] output is the snake_case variant name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "metadata", serde(rename_all = "snake_case"))]
pub enum ImageType {
    /// Image embedded as a data URI (base64 or another encoding)
    DataUri,
    /// Inline `<svg>` element
    InlineSvg,
    /// Image referenced by an external http/https URL
    External,
    /// Image referenced by a relative path
    Relative,
}

impl std::fmt::Display for ImageType {
    /// Write the snake_case name of the image category.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::DataUri => "data_uri",
            Self::InlineSvg => "inline_svg",
            Self::External => "external",
            Self::Relative => "relative",
        };
        f.write_str(label)
    }
}
267
/// Markup format a structured data block was written in.
///
/// The [`std::fmt::Display`] output matches the serde names: `json_ld`,
/// `microdata`, and `rdfa`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "metadata", serde(rename_all = "snake_case"))]
pub enum StructuredDataType {
    /// JSON-LD (JSON for Linking Data) script block
    #[cfg_attr(feature = "metadata", serde(rename = "json_ld"))]
    JsonLd,
    /// HTML5 Microdata attributes (itemscope, itemtype, itemprop)
    Microdata,
    /// RDF in Attributes (`RDFa`) markup
    // Explicit rename: `snake_case` alone would not produce "rdfa" for this name.
    #[cfg_attr(feature = "metadata", serde(rename = "rdfa"))]
    RDFa,
}

impl std::fmt::Display for StructuredDataType {
    /// Write the snake_case name of the structured data format.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::JsonLd => "json_ld",
            Self::Microdata => "microdata",
            Self::RDFa => "rdfa",
        };
        f.write_str(label)
    }
}
294
295/// Document-level metadata extracted from `<head>` and top-level elements.
296///
297/// Contains all metadata typically used by search engines, social media platforms,
298/// and browsers for document indexing and presentation.
299///
300/// # Examples
301///
302/// ```
303/// # use html_to_markdown_rs::metadata::DocumentMetadata;
304/// let doc = DocumentMetadata {
305///     title: Some("My Article".to_string()),
306///     description: Some("A great article about Rust".to_string()),
307///     keywords: vec!["rust".to_string(), "programming".to_string()],
308///     ..Default::default()
309/// };
310///
311/// assert_eq!(doc.title, Some("My Article".to_string()));
312/// ```
313#[derive(Debug, Clone, Default)]
314#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
315pub struct DocumentMetadata {
316    /// Document title from `<title>` tag
317    pub title: Option<String>,
318
319    /// Document description from `<meta name="description">` tag
320    pub description: Option<String>,
321
322    /// Document keywords from `<meta name="keywords">` tag, split on commas
323    pub keywords: Vec<String>,
324
325    /// Document author from `<meta name="author">` tag
326    pub author: Option<String>,
327
328    /// Canonical URL from `<link rel="canonical">` tag
329    pub canonical_url: Option<String>,
330
331    /// Base URL from `<base href="">` tag for resolving relative URLs
332    pub base_href: Option<String>,
333
334    /// Document language from `lang` attribute
335    pub language: Option<String>,
336
337    /// Document text direction from `dir` attribute
338    pub text_direction: Option<TextDirection>,
339
340    /// Open Graph metadata (og:* properties) for social media
341    /// Keys like "title", "description", "image", "url", etc.
342    pub open_graph: BTreeMap<String, String>,
343
344    /// Twitter Card metadata (twitter:* properties)
345    /// Keys like "card", "site", "creator", "title", "description", "image", etc.
346    pub twitter_card: BTreeMap<String, String>,
347
348    /// Additional meta tags not covered by specific fields
349    /// Keys are meta name/property attributes, values are content
350    pub meta_tags: BTreeMap<String, String>,
351}
352
/// Metadata for one heading element (h1-h6).
///
/// Records the heading's level, text, optional id, and where it sits in the
/// document (tree depth and byte offset).
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::HeaderMetadata;
/// let header = HeaderMetadata {
///     level: 1,
///     text: "Main Title".to_string(),
///     id: Some("main-title".to_string()),
///     depth: 0,
///     html_offset: 145,
/// };
///
/// assert_eq!(header.level, 1);
/// assert!(header.is_valid());
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct HeaderMetadata {
    /// Heading level, from 1 (h1) to 6 (h6)
    pub level: u8,

    /// Normalized text content of the heading
    pub text: String,

    /// Value of the HTML id attribute, when present
    pub id: Option<String>,

    /// Depth of the heading element in the document tree
    pub depth: usize,

    /// Byte offset of the heading in the original HTML document
    pub html_offset: usize,
}

impl HeaderMetadata {
    /// Report whether `level` names a real HTML heading (h1 through h6).
    ///
    /// # Returns
    ///
    /// `true` when `level` is between 1 and 6 inclusive, `false` otherwise.
    ///
    /// # Examples
    ///
    /// ```
    /// # use html_to_markdown_rs::metadata::HeaderMetadata;
    /// let valid = HeaderMetadata {
    ///     level: 3,
    ///     text: "Title".to_string(),
    ///     id: None,
    ///     depth: 2,
    ///     html_offset: 100,
    /// };
    /// assert!(valid.is_valid());
    ///
    /// let invalid = HeaderMetadata {
    ///     level: 7,  // There is no h7
    ///     text: "Title".to_string(),
    ///     id: None,
    ///     depth: 2,
    ///     html_offset: 100,
    /// };
    /// assert!(!invalid.is_valid());
    /// ```
    #[must_use]
    pub const fn is_valid(&self) -> bool {
        matches!(self.level, 1..=6)
    }
}
426
427/// Hyperlink metadata with categorization and attributes.
428///
429/// Represents `<a>` elements with parsed href values, text content, and link type classification.
430///
431/// # Examples
432///
433/// ```
434/// # use html_to_markdown_rs::metadata::{LinkMetadata, LinkType};
435/// let link = LinkMetadata {
436///     href: "https://example.com".to_string(),
437///     text: "Example".to_string(),
438///     title: Some("Visit Example".to_string()),
439///     link_type: LinkType::External,
440///     rel: vec!["nofollow".to_string()],
441///     attributes: Default::default(),
442/// };
443///
444/// assert_eq!(link.link_type, LinkType::External);
445/// assert_eq!(link.text, "Example");
446/// ```
447#[derive(Debug, Clone)]
448#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
449pub struct LinkMetadata {
450    /// The href URL value
451    pub href: String,
452
453    /// Link text content (normalized, concatenated if mixed with elements)
454    pub text: String,
455
456    /// Optional title attribute (often shown as tooltip)
457    pub title: Option<String>,
458
459    /// Link type classification
460    pub link_type: LinkType,
461
462    /// Rel attribute values (e.g., "nofollow", "stylesheet", "canonical")
463    pub rel: Vec<String>,
464
465    /// Additional HTML attributes
466    pub attributes: BTreeMap<String, String>,
467}
468
469impl LinkMetadata {
470    /// Classify a link based on href value.
471    ///
472    /// # Arguments
473    ///
474    /// * `href` - The href attribute value
475    ///
476    /// # Returns
477    ///
478    /// Appropriate [`LinkType`] based on protocol and content.
479    ///
480    /// # Examples
481    ///
482    /// ```
483    /// # use html_to_markdown_rs::metadata::{LinkMetadata, LinkType};
484    /// assert_eq!(LinkMetadata::classify_link("#section"), LinkType::Anchor);
485    /// assert_eq!(LinkMetadata::classify_link("mailto:test@example.com"), LinkType::Email);
486    /// assert_eq!(LinkMetadata::classify_link("tel:+1234567890"), LinkType::Phone);
487    /// assert_eq!(LinkMetadata::classify_link("https://example.com"), LinkType::External);
488    /// ```
489    #[must_use]
490    pub fn classify_link(href: &str) -> LinkType {
491        if href.starts_with('#') {
492            LinkType::Anchor
493        } else if href.starts_with("mailto:") {
494            LinkType::Email
495        } else if href.starts_with("tel:") {
496            LinkType::Phone
497        } else if href.starts_with("http://") || href.starts_with("https://") {
498            LinkType::External
499        } else if href.starts_with('/') || href.starts_with("../") || href.starts_with("./") {
500            LinkType::Internal
501        } else {
502            LinkType::Other
503        }
504    }
505}
506
507/// Image metadata with source and dimensions.
508///
509/// Captures `<img>` elements and inline `<svg>` elements with metadata
510/// for image analysis and optimization.
511///
512/// # Examples
513///
514/// ```
515/// # use html_to_markdown_rs::metadata::{ImageMetadata, ImageType};
516/// let img = ImageMetadata {
517///     src: "https://example.com/image.jpg".to_string(),
518///     alt: Some("An example image".to_string()),
519///     title: Some("Example".to_string()),
520///     dimensions: Some((800, 600)),
521///     image_type: ImageType::External,
522///     attributes: Default::default(),
523/// };
524///
525/// assert_eq!(img.image_type, ImageType::External);
526/// ```
527#[derive(Debug, Clone)]
528#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
529pub struct ImageMetadata {
530    /// Image source (URL, data URI, or SVG content identifier)
531    pub src: String,
532
533    /// Alternative text from alt attribute (for accessibility)
534    pub alt: Option<String>,
535
536    /// Title attribute (often shown as tooltip)
537    pub title: Option<String>,
538
539    /// Image dimensions as (width, height) if available
540    pub dimensions: Option<(u32, u32)>,
541
542    /// Image type classification
543    pub image_type: ImageType,
544
545    /// Additional HTML attributes
546    pub attributes: BTreeMap<String, String>,
547}
548
549/// Structured data block (JSON-LD, Microdata, or `RDFa`).
550///
551/// Represents machine-readable structured data found in the document.
552/// JSON-LD blocks are collected as raw JSON strings for flexibility.
553///
554/// # Examples
555///
556/// ```
557/// # use html_to_markdown_rs::metadata::{StructuredData, StructuredDataType};
558/// let schema = StructuredData {
559///     data_type: StructuredDataType::JsonLd,
560///     raw_json: r#"{"@context":"https://schema.org","@type":"Article"}"#.to_string(),
561///     schema_type: Some("Article".to_string()),
562/// };
563///
564/// assert_eq!(schema.data_type, StructuredDataType::JsonLd);
565/// ```
566#[derive(Debug, Clone)]
567#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
568pub struct StructuredData {
569    /// Type of structured data (JSON-LD, Microdata, `RDFa`)
570    pub data_type: StructuredDataType,
571
572    /// Raw JSON string (for JSON-LD) or serialized representation
573    pub raw_json: String,
574
575    /// Schema type if detectable (e.g., "Article", "Event", "Product")
576    pub schema_type: Option<String>,
577}
578
/// Structured data size limit, in bytes, used by `MetadataConfig::default()`
/// (`1_000_000` bytes, i.e. 1 MB).
pub const DEFAULT_MAX_STRUCTURED_DATA_SIZE: usize = 1_000_000;
581
/// Settings that decide which metadata is extracted and how much is kept.
///
/// Each `extract_*` flag switches one metadata category on or off during the
/// HTML-to-Markdown conversion, and `max_structured_data_size` caps how much
/// structured data is collected.
///
/// # Fields
///
/// - `extract_document`: document-level metadata (title, description, author, Open Graph, Twitter Card, etc.)
/// - `extract_headers`: heading elements (h1-h6) with hierarchy tracking
/// - `extract_links`: anchor elements with link type classification
/// - `extract_images`: image elements with source and dimension metadata
/// - `extract_structured_data`: structured data (JSON-LD, Microdata, `RDFa`)
/// - `max_structured_data_size`: safety limit on total structured data size in bytes
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::MetadataConfig;
/// let config = MetadataConfig {
///     extract_document: true,
///     extract_headers: true,
///     extract_links: true,
///     extract_images: true,
///     extract_structured_data: true,
///     max_structured_data_size: 1_000_000,
/// };
///
/// assert!(config.extract_headers);
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct MetadataConfig {
    /// Collect document-level metadata (title, description, author, etc.).
    ///
    /// Covers the `<head>` section:
    /// - `<title>` element content
    /// - `<meta name="description">` and other standard meta tags
    /// - Open Graph (og:*) properties for social media optimization
    /// - Twitter Card (twitter:*) properties
    /// - Language and text direction attributes
    /// - Canonical URL and base href references
    pub extract_document: bool,

    /// Collect h1-h6 heading elements and their hierarchy.
    ///
    /// Each heading is recorded with:
    /// - Header level (1-6)
    /// - Text content (normalized)
    /// - HTML id attribute if present
    /// - Document tree depth for hierarchy tracking
    /// - Byte offset in original HTML for positioning
    pub extract_headers: bool,

    /// Collect anchor (a) elements as links with type classification.
    ///
    /// Each link is recorded with:
    /// - href attribute value
    /// - Link text content
    /// - Title attribute (tooltip text)
    /// - Automatic link type classification (anchor, internal, external, email, phone, other)
    /// - Rel attribute values
    /// - Additional custom attributes
    pub extract_links: bool,

    /// Collect image elements and data URIs.
    ///
    /// Each image is recorded with:
    /// - Source URL or data URI
    /// - Alt text for accessibility
    /// - Title attribute
    /// - Dimensions (width, height) if available
    /// - Automatic image type classification (data URI, external, relative, inline SVG)
    /// - Additional custom attributes
    pub extract_images: bool,

    /// Collect structured data (JSON-LD, Microdata, `RDFa`).
    ///
    /// Machine-readable data gathered includes:
    /// - JSON-LD script blocks with schema detection
    /// - Microdata attributes (itemscope, itemtype, itemprop)
    /// - `RDFa` markup
    /// - Extracted schema type if detectable
    pub extract_structured_data: bool,

    /// Upper bound, in bytes, on the total structured data collected.
    ///
    /// Guards against memory exhaustion from malformed or adversarial documents
    /// with excessively large structured data blocks: once the accumulated size
    /// exceeds this limit, further collection stops.
    /// Default: `1_000_000` bytes (1 MB)
    pub max_structured_data_size: usize,
}
676
/// Sparse set of overrides for a `MetadataConfig`.
///
/// Every field is an `Option`: a `Some` value replaces the matching setting when
/// the update is applied via [`MetadataConfig::apply_update`], while `None` keeps
/// the existing value. The default update has every field set to `None`.
///
/// # Fields
///
/// - `extract_document`: optional override for document-level metadata extraction
/// - `extract_headers`: optional override for heading element extraction
/// - `extract_links`: optional override for link element extraction
/// - `extract_images`: optional override for image element extraction
/// - `extract_structured_data`: optional override for structured data extraction
/// - `max_structured_data_size`: optional override for structured data size limit
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::{MetadataConfig, MetadataConfigUpdate};
/// let update = MetadataConfigUpdate {
///     extract_document: Some(false),
///     extract_headers: Some(true),
///     extract_links: None,  // Keep current value
///     extract_images: None,  // Keep current value
///     extract_structured_data: None,  // Keep current value
///     max_structured_data_size: None,  // Keep current value
/// };
///
/// let mut config = MetadataConfig::default();
/// config.apply_update(update);
/// assert!(!config.extract_document);
/// assert!(config.extract_headers);
/// ```
#[derive(Debug, Clone, Default)]
#[cfg_attr(any(feature = "serde", feature = "metadata"), derive(serde::Deserialize))]
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
pub struct MetadataConfigUpdate {
    /// Override for document-level metadata extraction.
    ///
    /// `Some(true)` turns it on, `Some(false)` turns it off, and `None` keeps
    /// the current setting.
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_document"))]
    pub extract_document: Option<bool>,

    /// Override for heading element (h1-h6) extraction.
    ///
    /// `Some(true)` turns it on, `Some(false)` turns it off, and `None` keeps
    /// the current setting.
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_headers"))]
    pub extract_headers: Option<bool>,

    /// Override for anchor (link) element extraction.
    ///
    /// `Some(true)` turns it on, `Some(false)` turns it off, and `None` keeps
    /// the current setting.
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_links"))]
    pub extract_links: Option<bool>,

    /// Override for image element extraction.
    ///
    /// `Some(true)` turns it on, `Some(false)` turns it off, and `None` keeps
    /// the current setting.
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_images"))]
    pub extract_images: Option<bool>,

    /// Override for structured data (JSON-LD, Microdata, `RDFa`) extraction.
    ///
    /// `Some(true)` turns it on, `Some(false)` turns it off, and `None` keeps
    /// the current setting.
    #[cfg_attr(
        any(feature = "serde", feature = "metadata"),
        serde(alias = "extract_structured_data")
    )]
    pub extract_structured_data: Option<bool>,

    /// Override for the maximum structured data collection size, in bytes.
    ///
    /// `Some(size)` installs a new limit; `None` keeps the current one.
    /// Useful for adjusting the safety threshold for different documents.
    #[cfg_attr(
        any(feature = "serde", feature = "metadata"),
        serde(alias = "max_structured_data_size")
    )]
    pub max_structured_data_size: Option<usize>,
}
762
763impl Default for MetadataConfig {
764    /// Create default metadata configuration.
765    ///
766    /// Defaults to extracting all metadata types with 1MB limit on structured data.
767    fn default() -> Self {
768        Self {
769            extract_document: true,
770            extract_headers: true,
771            extract_links: true,
772            extract_images: true,
773            extract_structured_data: true,
774            max_structured_data_size: DEFAULT_MAX_STRUCTURED_DATA_SIZE,
775        }
776    }
777}
778
779impl MetadataConfig {
780    /// Check if any metadata extraction is enabled.
781    ///
782    /// Returns `true` if at least one extraction category is enabled, `false` if all are disabled.
783    /// This is useful for early exit optimization when the application doesn't need metadata.
784    ///
785    /// # Returns
786    ///
787    /// `true` if any of the extraction flags are enabled, `false` if all are disabled.
788    ///
789    /// # Examples
790    ///
791    /// ```
792    /// # use html_to_markdown_rs::metadata::MetadataConfig;
793    /// // All enabled
794    /// let config = MetadataConfig::default();
795    /// assert!(config.any_enabled());
796    ///
797    /// // Selectively enabled
798    /// let config = MetadataConfig {
799    ///     extract_headers: true,
800    ///     extract_document: false,
801    ///     extract_links: false,
802    ///     extract_images: false,
803    ///     extract_structured_data: false,
804    ///     max_structured_data_size: 1_000_000,
805    /// };
806    /// assert!(config.any_enabled());
807    ///
808    /// // All disabled
809    /// let config = MetadataConfig {
810    ///     extract_document: false,
811    ///     extract_headers: false,
812    ///     extract_links: false,
813    ///     extract_images: false,
814    ///     extract_structured_data: false,
815    ///     max_structured_data_size: 1_000_000,
816    /// };
817    /// assert!(!config.any_enabled());
818    /// ```
819    #[must_use]
820    pub const fn any_enabled(&self) -> bool {
821        self.extract_document
822            || self.extract_headers
823            || self.extract_links
824            || self.extract_images
825            || self.extract_structured_data
826    }
827
828    /// Apply a partial update to this metadata configuration.
829    ///
830    /// Any specified fields in the update (Some values) will override the current values.
831    /// Unspecified fields (None) are left unchanged. This allows selective modification
832    /// of configuration without affecting unrelated settings.
833    ///
834    /// # Arguments
835    ///
836    /// * `update` - Partial metadata config update with fields to override
837    ///
838    /// # Examples
839    ///
840    /// ```
841    /// # use html_to_markdown_rs::metadata::{MetadataConfig, MetadataConfigUpdate};
842    /// let mut config = MetadataConfig::default();
843    /// // config starts with all extraction enabled
844    ///
845    /// let update = MetadataConfigUpdate {
846    ///     extract_document: Some(false),
847    ///     extract_images: Some(false),
848    ///     // All other fields are None, so they won't change
849    ///     ..Default::default()
850    /// };
851    ///
852    /// config.apply_update(update);
853    ///
854    /// assert!(!config.extract_document);
855    /// assert!(!config.extract_images);
856    /// assert!(config.extract_headers);  // Unchanged
857    /// assert!(config.extract_links);    // Unchanged
858    /// ```
859    pub const fn apply_update(&mut self, update: MetadataConfigUpdate) {
860        if let Some(extract_document) = update.extract_document {
861            self.extract_document = extract_document;
862        }
863        if let Some(extract_headers) = update.extract_headers {
864            self.extract_headers = extract_headers;
865        }
866        if let Some(extract_links) = update.extract_links {
867            self.extract_links = extract_links;
868        }
869        if let Some(extract_images) = update.extract_images {
870            self.extract_images = extract_images;
871        }
872        if let Some(extract_structured_data) = update.extract_structured_data {
873            self.extract_structured_data = extract_structured_data;
874        }
875        if let Some(max_structured_data_size) = update.max_structured_data_size {
876            self.max_structured_data_size = max_structured_data_size;
877        }
878    }
879
880    /// Create new metadata configuration from a partial update.
881    ///
882    /// Creates a new `MetadataConfig` struct with defaults, then applies the update.
883    /// Fields not specified in the update (None) keep their default values.
884    /// This is a convenience method for constructing a configuration from a partial specification
885    /// without needing to explicitly call `.default()` first.
886    ///
887    /// # Arguments
888    ///
889    /// * `update` - Partial metadata config update with fields to set
890    ///
891    /// # Returns
892    ///
893    /// New `MetadataConfig` with specified updates applied to defaults
894    ///
895    /// # Examples
896    ///
897    /// ```
898    /// # use html_to_markdown_rs::metadata::{MetadataConfig, MetadataConfigUpdate};
899    /// let update = MetadataConfigUpdate {
900    ///     extract_document: Some(false),
901    ///     extract_headers: Some(true),
902    ///     extract_links: Some(true),
903    ///     extract_images: None,  // Will use default (true)
904    ///     extract_structured_data: None,  // Will use default (true)
905    ///     max_structured_data_size: None,  // Will use default (1MB)
906    /// };
907    ///
908    /// let config = MetadataConfig::from_update(update);
909    ///
910    /// assert!(!config.extract_document);
911    /// assert!(config.extract_headers);
912    /// assert!(config.extract_links);
913    /// assert!(config.extract_images);  // Default
914    /// assert!(config.extract_structured_data);  // Default
915    /// ```
916    #[must_use]
917    pub fn from_update(update: MetadataConfigUpdate) -> Self {
918        let mut config = Self::default();
919        config.apply_update(update);
920        config
921    }
922}
923
924impl From<MetadataConfigUpdate> for MetadataConfig {
925    fn from(update: MetadataConfigUpdate) -> Self {
926        Self::from_update(update)
927    }
928}
929
/// Aggregate result of metadata extraction for one HTML document.
///
/// Bundles every metadata category into a single value. This is the type
/// produced by `MetadataCollector::finish`. When the `metadata` feature is
/// enabled the struct also derives `serde::Serialize` and `serde::Deserialize`.
///
/// All fields have usable empty defaults, so `ExtendedMetadata::default()` yields
/// a value with default document metadata and empty collections.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::ExtendedMetadata;
/// let metadata = ExtendedMetadata {
///     document: Default::default(),
///     headers: Vec::new(),
///     links: Vec::new(),
///     images: Vec::new(),
///     structured_data: Vec::new(),
/// };
///
/// assert!(metadata.headers.is_empty());
/// ```
#[derive(Debug, Clone, Default)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct ExtendedMetadata {
    /// Document-level metadata (title, description, canonical URL, Open Graph,
    /// Twitter Card, language, text direction, remaining meta tags).
    pub document: DocumentMetadata,

    /// Heading elements, in the order they were recorded by the collector.
    pub headers: Vec<HeaderMetadata>,

    /// Hyperlinks, each carrying its [`LinkType`] classification, in recording order.
    pub links: Vec<LinkMetadata>,

    /// Images with their source, optional dimensions and [`ImageType`], in recording order.
    pub images: Vec<ImageMetadata>,

    /// Structured data blocks; the collector in this module fills this from JSON-LD content.
    pub structured_data: Vec<StructuredData>,
}
967
/// Internal metadata collector for single-pass extraction.
///
/// Follows the [`InlineImageCollector`](crate::inline_images::InlineImageCollector) pattern
/// for efficient metadata extraction during tree traversal. The collector accumulates
/// raw data through its `add_*` / `set_*` methods and is turned into an
/// [`ExtendedMetadata`] by `finish`.
///
/// # Architecture
///
/// The collector is designed to be:
/// - **Performant**: Pre-allocated collections, minimal cloning
/// - **Single-pass**: Collects during main tree walk without separate passes
/// - **Optional**: Zero overhead when disabled via feature flags
/// - **Type-safe**: Strict separation of collection and result types
///
/// # Internal State
///
/// - `head_metadata`: Raw key/value pairs merged in via `set_head_metadata`
/// - `headers`: Collected header elements
/// - `header_stack`: Pre-allocated in `new`; not written by any method of the
///   main `impl` block (NOTE(review): confirm whether it is still needed)
/// - `links`: Collected link elements
/// - `images`: Collected image elements
/// - `json_ld`: Accepted JSON-LD script block contents
/// - `structured_data_size`: Running byte total of the accepted JSON-LD blocks
/// - `config`: Extraction switches and the structured-data size limit
/// - `lang`: Document language (first value set wins)
/// - `dir`: Raw document text direction (first value set wins; parsed in `finish`)
#[derive(Debug)]
#[allow(dead_code)]
pub(crate) struct MetadataCollector {
    /// Raw head metadata; later insertions for the same key overwrite earlier ones.
    head_metadata: BTreeMap<String, String>,
    /// Headers in the order they were added.
    headers: Vec<HeaderMetadata>,
    /// Reserved with capacity 6 in `new`; unused by the methods of the main `impl` block.
    header_stack: Vec<usize>,
    /// Links in the order they were added.
    links: Vec<LinkMetadata>,
    /// Images in the order they were added.
    images: Vec<ImageMetadata>,
    /// Raw JSON-LD strings that passed the size checks in `add_json_ld`.
    json_ld: Vec<String>,
    /// Sum of the byte lengths of all strings stored in `json_ld`.
    structured_data_size: usize,
    /// Controls which categories are collected and the structured-data budget.
    config: MetadataConfig,
    /// Language value supplied through `set_language`, if any.
    lang: Option<String>,
    /// Unparsed direction value supplied through `set_text_direction`, if any.
    dir: Option<String>,
}
1010
1011#[allow(dead_code)]
1012impl MetadataCollector {
1013    /// Create a new metadata collector with configuration.
1014    ///
1015    /// Pre-allocates collections based on typical document sizes
1016    /// for efficient append operations during traversal.
1017    ///
1018    /// # Arguments
1019    ///
1020    /// * `config` - Extraction configuration specifying which types to collect
1021    ///
1022    /// # Returns
1023    ///
1024    /// A new collector ready for use during tree traversal.
1025    ///
1026    /// # Examples
1027    ///
1028    /// ```ignore
1029    /// let config = MetadataConfig::default();
1030    /// let collector = MetadataCollector::new(config);
1031    /// ```
1032    pub(crate) fn new(config: MetadataConfig) -> Self {
1033        Self {
1034            head_metadata: BTreeMap::new(),
1035            headers: Vec::with_capacity(32),
1036            header_stack: Vec::with_capacity(6),
1037            links: Vec::with_capacity(64),
1038            images: Vec::with_capacity(16),
1039            json_ld: Vec::with_capacity(4),
1040            structured_data_size: 0,
1041            config,
1042            lang: None,
1043            dir: None,
1044        }
1045    }
1046
1047    /// Add a header element to the collection.
1048    ///
1049    /// Validates that level is in range 1-6 and tracks hierarchy via depth.
1050    ///
1051    /// # Arguments
1052    ///
1053    /// * `level` - Header level (1-6)
1054    /// * `text` - Normalized header text content
1055    /// * `id` - Optional HTML id attribute
1056    /// * `depth` - Current document nesting depth
1057    /// * `html_offset` - Byte offset in original HTML
1058    pub(crate) fn add_header(&mut self, level: u8, text: String, id: Option<String>, depth: usize, html_offset: usize) {
1059        if !self.config.extract_headers {
1060            return;
1061        }
1062
1063        if !(1..=6).contains(&level) {
1064            return;
1065        }
1066
1067        let header = HeaderMetadata {
1068            level,
1069            text,
1070            id,
1071            depth,
1072            html_offset,
1073        };
1074
1075        self.headers.push(header);
1076    }
1077
1078    /// Add a link element to the collection.
1079    ///
1080    /// Classifies the link based on href value and stores with metadata.
1081    ///
1082    /// # Arguments
1083    ///
1084    /// * `href` - The href attribute value
1085    /// * `text` - Link text content
1086    /// * `title` - Optional title attribute
1087    /// * `rel` - Comma/space-separated rel attribute value
1088    /// * `attributes` - Additional attributes to capture (e.g., data-* or aria-* values)
1089    pub(crate) fn add_link(
1090        &mut self,
1091        href: String,
1092        text: String,
1093        title: Option<String>,
1094        rel: Option<String>,
1095        attributes: BTreeMap<String, String>,
1096    ) {
1097        if !self.config.extract_links {
1098            return;
1099        }
1100
1101        let link_type = LinkMetadata::classify_link(&href);
1102
1103        let rel_vec = rel
1104            .map(|r| {
1105                r.split_whitespace()
1106                    .map(std::string::ToString::to_string)
1107                    .collect::<Vec<_>>()
1108            })
1109            .unwrap_or_default();
1110
1111        let link = LinkMetadata {
1112            href,
1113            text,
1114            title,
1115            link_type,
1116            rel: rel_vec,
1117            attributes,
1118        };
1119
1120        self.links.push(link);
1121    }
1122
1123    /// Add an image element to the collection.
1124    ///
1125    /// # Arguments
1126    ///
1127    /// * `src` - Image source (URL or data URI)
1128    /// * `alt` - Optional alt text
1129    /// * `title` - Optional title attribute
1130    /// * `dimensions` - Optional (width, height) tuple
1131    pub(crate) fn add_image(
1132        &mut self,
1133        src: String,
1134        alt: Option<String>,
1135        title: Option<String>,
1136        dimensions: Option<(u32, u32)>,
1137        attributes: BTreeMap<String, String>,
1138    ) {
1139        if !self.config.extract_images {
1140            return;
1141        }
1142
1143        let image_type = if src.starts_with("data:") {
1144            ImageType::DataUri
1145        } else if src.starts_with("http://") || src.starts_with("https://") {
1146            ImageType::External
1147        } else if src.starts_with('<') && src.contains("svg") {
1148            ImageType::InlineSvg
1149        } else {
1150            ImageType::Relative
1151        };
1152
1153        let image = ImageMetadata {
1154            src,
1155            alt,
1156            title,
1157            dimensions,
1158            image_type,
1159            attributes,
1160        };
1161
1162        self.images.push(image);
1163    }
1164
1165    /// Add a JSON-LD structured data block.
1166    ///
1167    /// Accumulates JSON content with size validation against configured limits.
1168    ///
1169    /// # Arguments
1170    ///
1171    /// * `json_content` - Raw JSON string content
1172    pub(crate) fn add_json_ld(&mut self, json_content: String) {
1173        if !self.config.extract_structured_data {
1174            return;
1175        }
1176
1177        let content_size = json_content.len();
1178        if content_size > self.config.max_structured_data_size {
1179            return;
1180        }
1181        if self.structured_data_size + content_size > self.config.max_structured_data_size {
1182            return;
1183        }
1184
1185        self.structured_data_size += content_size;
1186        self.json_ld.push(json_content);
1187    }
1188
1189    /// Set document head metadata from extracted head section.
1190    ///
1191    /// Merges metadata pairs from head elements (meta, title, link, etc.)
1192    /// into the collector's head metadata store.
1193    ///
1194    /// # Arguments
1195    ///
1196    /// * `metadata` - `BTreeMap` of metadata key-value pairs
1197    pub(crate) fn set_head_metadata(&mut self, metadata: BTreeMap<String, String>) {
1198        if !self.config.extract_document {
1199            return;
1200        }
1201        self.head_metadata.extend(metadata);
1202    }
1203
1204    /// Set document language attribute.
1205    ///
1206    /// Usually from `lang` attribute on `<html>` or `<body>` tag.
1207    /// Only sets if not already set (first occurrence wins).
1208    ///
1209    /// # Arguments
1210    ///
1211    /// * `lang` - Language code (e.g., "en", "es", "fr")
1212    pub(crate) fn set_language(&mut self, lang: String) {
1213        if !self.config.extract_document {
1214            return;
1215        }
1216        if self.lang.is_none() {
1217            self.lang = Some(lang);
1218        }
1219    }
1220
1221    /// Set document text direction attribute.
1222    ///
1223    /// Usually from `dir` attribute on `<html>` or `<body>` tag.
1224    /// Only sets if not already set (first occurrence wins).
1225    ///
1226    /// # Arguments
1227    ///
1228    /// * `dir` - Direction string ("ltr", "rtl", or "auto")
1229    pub(crate) fn set_text_direction(&mut self, dir: String) {
1230        if !self.config.extract_document {
1231            return;
1232        }
1233        if self.dir.is_none() {
1234            self.dir = Some(dir);
1235        }
1236    }
1237
    /// Whether document-level metadata extraction is enabled in the configuration.
    pub(crate) const fn wants_document(&self) -> bool {
        self.config.extract_document
    }
1241
    /// Whether header extraction is enabled in the configuration.
    pub(crate) const fn wants_headers(&self) -> bool {
        self.config.extract_headers
    }
1245
    /// Whether link extraction is enabled in the configuration.
    pub(crate) const fn wants_links(&self) -> bool {
        self.config.extract_links
    }
1249
    /// Whether image extraction is enabled in the configuration.
    pub(crate) const fn wants_images(&self) -> bool {
        self.config.extract_images
    }
1253
    /// Whether structured-data extraction is enabled in the configuration.
    pub(crate) const fn wants_structured_data(&self) -> bool {
        self.config.extract_structured_data
    }
1257
1258    /// Extract document metadata from collected head metadata.
1259    ///
1260    /// Parses head metadata into structured document metadata,
1261    /// handling special cases like Open Graph, Twitter Card, keywords, etc.
1262    #[allow(dead_code)]
1263    fn extract_document_metadata(
1264        head_metadata: BTreeMap<String, String>,
1265        lang: Option<String>,
1266        dir: Option<String>,
1267    ) -> DocumentMetadata {
1268        let mut doc = DocumentMetadata::default();
1269
1270        for (raw_key, value) in head_metadata {
1271            let mut key = raw_key.as_str();
1272            let mut replaced_key: Option<String> = None;
1273
1274            if let Some(stripped) = key.strip_prefix("meta-") {
1275                key = stripped;
1276            }
1277
1278            if key.as_bytes().contains(&b':') {
1279                replaced_key = Some(key.replace(':', "-"));
1280                key = replaced_key.as_deref().unwrap_or(key);
1281            }
1282
1283            match key {
1284                "title" => doc.title = Some(value),
1285                "description" => doc.description = Some(value),
1286                "author" => doc.author = Some(value),
1287                "canonical" => doc.canonical_url = Some(value),
1288                "base" | "base-href" => doc.base_href = Some(value),
1289                key if key.starts_with("og-") => {
1290                    let og_key = if key.as_bytes().contains(&b'-') {
1291                        key.trim_start_matches("og-").replace('-', "_")
1292                    } else {
1293                        key.trim_start_matches("og-").to_string()
1294                    };
1295                    doc.open_graph.insert(og_key, value);
1296                }
1297                key if key.starts_with("twitter-") => {
1298                    let tw_key = if key.as_bytes().contains(&b'-') {
1299                        key.trim_start_matches("twitter-").replace('-', "_")
1300                    } else {
1301                        key.trim_start_matches("twitter-").to_string()
1302                    };
1303                    doc.twitter_card.insert(tw_key, value);
1304                }
1305                "keywords" => {
1306                    doc.keywords = value
1307                        .split(',')
1308                        .map(|s| s.trim().to_string())
1309                        .filter(|s| !s.is_empty())
1310                        .collect();
1311                }
1312                _ => {
1313                    let meta_key = if key.as_ptr() == raw_key.as_ptr() && key.len() == raw_key.len() {
1314                        raw_key
1315                    } else if let Some(replaced) = replaced_key {
1316                        replaced
1317                    } else {
1318                        key.to_string()
1319                    };
1320                    doc.meta_tags.insert(meta_key, value);
1321                }
1322            }
1323        }
1324
1325        if let Some(lang) = lang {
1326            doc.language = Some(lang);
1327        }
1328
1329        if let Some(dir) = dir {
1330            if let Some(parsed_dir) = TextDirection::parse(&dir) {
1331                doc.text_direction = Some(parsed_dir);
1332            }
1333        }
1334
1335        doc
1336    }
1337
1338    /// Extract structured data blocks into `StructuredData` items.
1339    #[allow(dead_code)]
1340    fn extract_structured_data(json_ld: Vec<String>) -> Vec<StructuredData> {
1341        let mut result = Vec::with_capacity(json_ld.len());
1342
1343        for json_str in json_ld {
1344            let schema_type = Self::scan_schema_type(&json_str)
1345                .or_else(|| {
1346                    if json_str.contains("\"@type\"") {
1347                        serde_json::from_str::<serde_json::Value>(&json_str).ok().and_then(|v| {
1348                            v.get("@type")
1349                                .and_then(|t| t.as_str().map(std::string::ToString::to_string))
1350                        })
1351                    } else {
1352                        None
1353                    }
1354                })
1355                .or_else(|| {
1356                    if !json_str.contains("\"@graph\"") {
1357                        return None;
1358                    }
1359
1360                    let value = serde_json::from_str::<serde_json::Value>(&json_str).ok()?;
1361                    let graph = value.get("@graph")?;
1362                    let items = graph.as_array()?;
1363                    items.iter().find_map(|item| {
1364                        item.get("@type")
1365                            .and_then(|t| t.as_str().map(std::string::ToString::to_string))
1366                    })
1367                });
1368
1369            result.push(StructuredData {
1370                data_type: StructuredDataType::JsonLd,
1371                raw_json: json_str,
1372                schema_type,
1373            });
1374        }
1375
1376        result
1377    }
1378
1379    fn scan_schema_type(json_str: &str) -> Option<String> {
1380        let needle = "\"@type\"";
1381        let start = json_str.find(needle)? + needle.len();
1382        let bytes = json_str.as_bytes();
1383        let mut i = start;
1384
1385        while i < bytes.len() && bytes[i].is_ascii_whitespace() {
1386            i += 1;
1387        }
1388        if i >= bytes.len() || bytes[i] != b':' {
1389            return None;
1390        }
1391        i += 1;
1392        while i < bytes.len() && bytes[i].is_ascii_whitespace() {
1393            i += 1;
1394        }
1395        if i >= bytes.len() {
1396            return None;
1397        }
1398
1399        if bytes[i] == b'[' {
1400            i += 1;
1401            while i < bytes.len() && bytes[i].is_ascii_whitespace() {
1402                i += 1;
1403            }
1404            if i >= bytes.len() || bytes[i] != b'"' {
1405                return None;
1406            }
1407        } else if bytes[i] != b'"' {
1408            return None;
1409        }
1410
1411        let start_quote = i;
1412        i += 1;
1413        let mut escaped = false;
1414        while i < bytes.len() {
1415            let byte = bytes[i];
1416            if escaped {
1417                escaped = false;
1418                i += 1;
1419                continue;
1420            }
1421            if byte == b'\\' {
1422                escaped = true;
1423                i += 1;
1424                continue;
1425            }
1426            if byte == b'"' {
1427                let end_quote = i;
1428                let slice = &json_str[start_quote..=end_quote];
1429                return serde_json::from_str::<String>(slice).ok();
1430            }
1431            i += 1;
1432        }
1433
1434        None
1435    }
1436
1437    /// Finish collection and return all extracted metadata.
1438    ///
1439    /// Performs final processing, validation, and consolidation of all
1440    /// collected data into the [`ExtendedMetadata`] output structure.
1441    ///
1442    /// # Returns
1443    ///
1444    /// Complete [`ExtendedMetadata`] with all extracted information.
1445    #[allow(dead_code)]
1446    pub(crate) fn finish(self) -> ExtendedMetadata {
1447        let structured_data = Self::extract_structured_data(self.json_ld);
1448        let document = Self::extract_document_metadata(self.head_metadata, self.lang, self.dir);
1449
1450        ExtendedMetadata {
1451            document,
1452            headers: self.headers,
1453            links: self.links,
1454            images: self.images,
1455            structured_data,
1456        }
1457    }
1458
1459    /// Categorize links by type for analysis and filtering.
1460    ///
1461    /// Separates collected links into groups by [`LinkType`].
1462    /// This is an analysis helper method; actual categorization happens during `add_link`.
1463    ///
1464    /// # Returns
1465    ///
1466    /// `BTreeMap` with `LinkType` as key and Vec of matching `LinkMetadata` as value.
1467    #[allow(dead_code)]
1468    pub(crate) fn categorize_links(&self) -> BTreeMap<String, Vec<&LinkMetadata>> {
1469        let mut categorized: BTreeMap<String, Vec<&LinkMetadata>> = BTreeMap::new();
1470
1471        for link in &self.links {
1472            let category = link.link_type.to_string();
1473            categorized.entry(category).or_default().push(link);
1474        }
1475
1476        categorized
1477    }
1478
1479    /// Count headers by level for structural analysis.
1480    ///
1481    /// Returns count of headers at each level (1-6).
1482    ///
1483    /// # Returns
1484    ///
1485    /// `BTreeMap` with level as string key and count as value.
1486    #[allow(dead_code)]
1487    pub(crate) fn header_counts(&self) -> BTreeMap<String, usize> {
1488        let mut counts: BTreeMap<String, usize> = BTreeMap::new();
1489
1490        for header in &self.headers {
1491            *counts.entry(header.level.to_string()).or_insert(0) += 1;
1492        }
1493
1494        counts
1495    }
1496}
1497
/// Handle to a metadata collector via reference-counted mutable cell.
///
/// Alias for `Rc<RefCell<MetadataCollector>>`, used internally for sharing collector
/// state across the tree traversal.
/// Matches the pattern used for [`InlineImageCollector`](crate::inline_images::InlineImageCollector).
///
/// # Examples
///
/// ```ignore
/// let collector = MetadataCollector::new(MetadataConfig::default());
/// let handle: MetadataCollectorHandle = Rc::new(RefCell::new(collector));
///
/// // In tree walk, can be passed and borrowed
/// handle.borrow_mut().add_header(1, "Title".to_string(), None, 0, 100);
///
/// // `finish` consumes the collector, so ownership has to be reclaimed from the
/// // handle first. `RefCell::take` is not usable for this because it requires
/// // `Default`, which `MetadataCollector` does not derive. `Rc::try_unwrap`
/// // succeeds only once every other clone of the handle has been dropped.
/// let collector = Rc::try_unwrap(handle)
///     .expect("no other handle clones remain")
///     .into_inner();
/// let metadata = collector.finish();
/// ```
#[allow(dead_code)]
pub(crate) type MetadataCollectorHandle = Rc<RefCell<MetadataCollector>>;
1516
1517#[cfg(test)]
1518mod tests {
1519    use super::*;
1520
1521    #[test]
1522    fn test_text_direction_parse() {
1523        assert_eq!(TextDirection::parse("ltr"), Some(TextDirection::LeftToRight));
1524        assert_eq!(TextDirection::parse("rtl"), Some(TextDirection::RightToLeft));
1525        assert_eq!(TextDirection::parse("auto"), Some(TextDirection::Auto));
1526        assert_eq!(TextDirection::parse("invalid"), None);
1527        assert_eq!(TextDirection::parse("LTR"), Some(TextDirection::LeftToRight));
1528    }
1529
1530    #[test]
1531    fn test_text_direction_display() {
1532        assert_eq!(TextDirection::LeftToRight.to_string(), "ltr");
1533        assert_eq!(TextDirection::RightToLeft.to_string(), "rtl");
1534        assert_eq!(TextDirection::Auto.to_string(), "auto");
1535    }
1536
1537    #[test]
1538    fn test_link_classification() {
1539        assert_eq!(LinkMetadata::classify_link("#section"), LinkType::Anchor);
1540        assert_eq!(LinkMetadata::classify_link("mailto:test@example.com"), LinkType::Email);
1541        assert_eq!(LinkMetadata::classify_link("tel:+1234567890"), LinkType::Phone);
1542        assert_eq!(LinkMetadata::classify_link("https://example.com"), LinkType::External);
1543        assert_eq!(LinkMetadata::classify_link("http://example.com"), LinkType::External);
1544        assert_eq!(LinkMetadata::classify_link("/path/to/page"), LinkType::Internal);
1545        assert_eq!(LinkMetadata::classify_link("../relative"), LinkType::Internal);
1546        assert_eq!(LinkMetadata::classify_link("./same"), LinkType::Internal);
1547    }
1548
1549    #[test]
1550    fn test_header_validation() {
1551        let valid = HeaderMetadata {
1552            level: 3,
1553            text: "Title".to_string(),
1554            id: None,
1555            depth: 2,
1556            html_offset: 100,
1557        };
1558        assert!(valid.is_valid());
1559
1560        let invalid_high = HeaderMetadata {
1561            level: 7,
1562            text: "Title".to_string(),
1563            id: None,
1564            depth: 2,
1565            html_offset: 100,
1566        };
1567        assert!(!invalid_high.is_valid());
1568
1569        let invalid_low = HeaderMetadata {
1570            level: 0,
1571            text: "Title".to_string(),
1572            id: None,
1573            depth: 2,
1574            html_offset: 100,
1575        };
1576        assert!(!invalid_low.is_valid());
1577    }
1578
1579    #[test]
1580    fn test_metadata_collector_new() {
1581        let config = MetadataConfig::default();
1582        let collector = MetadataCollector::new(config);
1583
1584        assert_eq!(collector.headers.capacity(), 32);
1585        assert_eq!(collector.links.capacity(), 64);
1586        assert_eq!(collector.images.capacity(), 16);
1587        assert_eq!(collector.json_ld.capacity(), 4);
1588    }
1589
1590    #[test]
1591    fn test_metadata_collector_add_header() {
1592        let config = MetadataConfig::default();
1593        let mut collector = MetadataCollector::new(config);
1594
1595        collector.add_header(1, "Title".to_string(), Some("title".to_string()), 0, 100);
1596        assert_eq!(collector.headers.len(), 1);
1597
1598        let header = &collector.headers[0];
1599        assert_eq!(header.level, 1);
1600        assert_eq!(header.text, "Title");
1601        assert_eq!(header.id, Some("title".to_string()));
1602
1603        collector.add_header(7, "Invalid".to_string(), None, 0, 200);
1604        assert_eq!(collector.headers.len(), 1);
1605    }
1606
1607    #[test]
1608    fn test_metadata_collector_add_link() {
1609        let config = MetadataConfig::default();
1610        let mut collector = MetadataCollector::new(config);
1611
1612        collector.add_link(
1613            "https://example.com".to_string(),
1614            "Example".to_string(),
1615            Some("Visit".to_string()),
1616            Some("nofollow external".to_string()),
1617            BTreeMap::from([("data-id".to_string(), "example".to_string())]),
1618        );
1619
1620        assert_eq!(collector.links.len(), 1);
1621
1622        let link = &collector.links[0];
1623        assert_eq!(link.href, "https://example.com");
1624        assert_eq!(link.text, "Example");
1625        assert_eq!(link.link_type, LinkType::External);
1626        assert_eq!(link.rel, vec!["nofollow", "external"]);
1627        assert_eq!(link.attributes.get("data-id"), Some(&"example".to_string()));
1628    }
1629
1630    #[test]
1631    fn test_metadata_collector_respects_config() {
1632        let config = MetadataConfig {
1633            extract_document: false,
1634            extract_headers: false,
1635            extract_links: false,
1636            extract_images: false,
1637            extract_structured_data: false,
1638            max_structured_data_size: DEFAULT_MAX_STRUCTURED_DATA_SIZE,
1639        };
1640        let mut collector = MetadataCollector::new(config);
1641
1642        collector.add_header(1, "Title".to_string(), None, 0, 100);
1643        collector.add_link(
1644            "https://example.com".to_string(),
1645            "Link".to_string(),
1646            None,
1647            None,
1648            BTreeMap::new(),
1649        );
1650        collector.add_image(
1651            "https://example.com/img.jpg".to_string(),
1652            None,
1653            None,
1654            None,
1655            BTreeMap::new(),
1656        );
1657        collector.add_json_ld("{}".to_string());
1658
1659        assert!(collector.headers.is_empty());
1660        assert!(collector.links.is_empty());
1661        assert!(collector.images.is_empty());
1662        assert!(collector.json_ld.is_empty());
1663    }
1664
1665    #[test]
1666    fn test_metadata_collector_finish() {
1667        let config = MetadataConfig::default();
1668        let mut collector = MetadataCollector::new(config);
1669
1670        collector.set_language("en".to_string());
1671        collector.add_header(1, "Main Title".to_string(), None, 0, 100);
1672        collector.add_link(
1673            "https://example.com".to_string(),
1674            "Example".to_string(),
1675            None,
1676            None,
1677            BTreeMap::new(),
1678        );
1679
1680        let metadata = collector.finish();
1681
1682        assert_eq!(metadata.document.language, Some("en".to_string()));
1683        assert_eq!(metadata.headers.len(), 1);
1684        assert_eq!(metadata.links.len(), 1);
1685    }
1686
1687    #[test]
1688    fn test_document_metadata_default() {
1689        let doc = DocumentMetadata::default();
1690
1691        assert!(doc.title.is_none());
1692        assert!(doc.description.is_none());
1693        assert!(doc.keywords.is_empty());
1694        assert!(doc.open_graph.is_empty());
1695        assert!(doc.twitter_card.is_empty());
1696        assert!(doc.meta_tags.is_empty());
1697    }
1698
1699    #[test]
1700    fn test_metadata_config_default() {
1701        let config = MetadataConfig::default();
1702
1703        assert!(config.extract_headers);
1704        assert!(config.extract_links);
1705        assert!(config.extract_images);
1706        assert!(config.extract_structured_data);
1707        assert_eq!(config.max_structured_data_size, DEFAULT_MAX_STRUCTURED_DATA_SIZE);
1708    }
1709
1710    #[test]
1711    fn test_image_type_classification() {
1712        let data_uri = ImageMetadata {
1713            src: "data:image/png;base64,iVBORw0KG...".to_string(),
1714            alt: None,
1715            title: None,
1716            dimensions: None,
1717            image_type: ImageType::DataUri,
1718            attributes: BTreeMap::new(),
1719        };
1720        assert_eq!(data_uri.image_type, ImageType::DataUri);
1721
1722        let external = ImageMetadata {
1723            src: "https://example.com/image.jpg".to_string(),
1724            alt: None,
1725            title: None,
1726            dimensions: None,
1727            image_type: ImageType::External,
1728            attributes: BTreeMap::new(),
1729        };
1730        assert_eq!(external.image_type, ImageType::External);
1731    }
1732
1733    #[test]
1734    fn test_link_type_display() {
1735        assert_eq!(LinkType::Anchor.to_string(), "anchor");
1736        assert_eq!(LinkType::Internal.to_string(), "internal");
1737        assert_eq!(LinkType::External.to_string(), "external");
1738        assert_eq!(LinkType::Email.to_string(), "email");
1739        assert_eq!(LinkType::Phone.to_string(), "phone");
1740        assert_eq!(LinkType::Other.to_string(), "other");
1741    }
1742
1743    #[test]
1744    fn test_structured_data_type_display() {
1745        assert_eq!(StructuredDataType::JsonLd.to_string(), "json_ld");
1746        assert_eq!(StructuredDataType::Microdata.to_string(), "microdata");
1747        assert_eq!(StructuredDataType::RDFa.to_string(), "rdfa");
1748    }
1749
1750    #[test]
1751    fn test_categorize_links() {
1752        let config = MetadataConfig::default();
1753        let mut collector = MetadataCollector::new(config);
1754
1755        collector.add_link("#anchor".to_string(), "Anchor".to_string(), None, None, BTreeMap::new());
1756        collector.add_link(
1757            "https://example.com".to_string(),
1758            "External".to_string(),
1759            None,
1760            None,
1761            BTreeMap::new(),
1762        );
1763        collector.add_link(
1764            "mailto:test@example.com".to_string(),
1765            "Email".to_string(),
1766            None,
1767            None,
1768            BTreeMap::new(),
1769        );
1770
1771        let categorized = collector.categorize_links();
1772
1773        assert_eq!(categorized.get("anchor").map(|v| v.len()), Some(1));
1774        assert_eq!(categorized.get("external").map(|v| v.len()), Some(1));
1775        assert_eq!(categorized.get("email").map(|v| v.len()), Some(1));
1776    }
1777
1778    #[test]
1779    fn test_header_counts() {
1780        let config = MetadataConfig::default();
1781        let mut collector = MetadataCollector::new(config);
1782
1783        collector.add_header(1, "H1".to_string(), None, 0, 100);
1784        collector.add_header(2, "H2".to_string(), None, 1, 200);
1785        collector.add_header(2, "H2b".to_string(), None, 1, 300);
1786        collector.add_header(3, "H3".to_string(), None, 2, 400);
1787
1788        let counts = collector.header_counts();
1789
1790        assert_eq!(counts.get("1").copied(), Some(1));
1791        assert_eq!(counts.get("2").copied(), Some(2));
1792        assert_eq!(counts.get("3").copied(), Some(1));
1793    }
1794}
html_to_markdown_rs/metadata.rs

html_to_markdown_rs/
metadata.rs