html_to_markdown_rs/
metadata.rs

1//! Metadata extraction for HTML to Markdown conversion.
2//!
3//! This module provides comprehensive, type-safe metadata extraction during HTML-to-Markdown
4//! conversion, enabling content analysis, SEO optimization, and document indexing workflows.
5//! Metadata includes:
6//! - **Document metadata**: Title, description, author, language, canonical URL, Open Graph, Twitter Card
7//! - **Headers**: Heading elements (h1-h6) with hierarchy, IDs, and positions
8//! - **Links**: Hyperlinks with type classification (anchor, internal, external, email, phone)
9//! - **Images**: Image elements with source, alt text, dimensions, and type (data URI, external, etc.)
10//! - **Structured data**: JSON-LD, Microdata, and RDFa blocks
11//!
12//! The implementation follows a single-pass collector pattern for zero-overhead extraction
13//! when metadata features are disabled.
14//!
15//! # Architecture
16//!
17//! Metadata extraction uses the [`MetadataCollector`] pattern (similar to [`InlineImageCollector`]):
18//! - **Single-pass collection**: Metadata is gathered during the primary tree traversal without additional passes
19//! - **Zero overhead when disabled**: Entire module can be compiled out via feature flags
20//! - **Configurable granularity**: Use [`MetadataConfig`] to select which metadata types to extract
21//! - **Type-safe APIs**: All metadata types are enum-based with exhaustive matching
22//! - **Memory-bounded**: Size limits prevent memory exhaustion from adversarial documents
23//! - **Pre-allocated buffers**: Typical documents (32 headers, 64 links, 16 images) handled efficiently
24//!
25//! # Type Overview
26//!
27//! ## Enumerations
28//!
29//! - [`TextDirection`]: Document directionality (LTR, RTL, Auto)
30//! - [`LinkType`]: Link classification (Anchor, Internal, External, Email, Phone, Other)
31//! - [`ImageType`]: Image source type (DataUri, External, Relative, InlineSvg)
32//! - [`StructuredDataType`]: Structured data format (JsonLd, Microdata, RDFa)
33//!
34//! ## Structures
35//!
36//! - [`DocumentMetadata`]: Head-level metadata with maps for Open Graph and Twitter Card
37//! - [`HeaderMetadata`]: Heading element with level (1-6), text, ID, hierarchy depth, and position
38//! - [`LinkMetadata`]: Hyperlink with href, text, title, type, rel attributes, and custom attributes
39//! - [`ImageMetadata`]: Image element with src, alt, title, dimensions, type, and attributes
40//! - [`StructuredData`]: Structured data block with type and raw JSON
41//! - [`MetadataConfig`]: Configuration controlling extraction granularity and size limits
42//! - [`ExtendedMetadata`]: Top-level result containing all extracted metadata
43//!
44//! # Examples
45//!
46//! ## Basic Usage with convert_with_metadata
47//!
48//! ```ignore
//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
//! use html_to_markdown_rs::metadata::{ImageType, LinkType};
50//!
51//! let html = r#"
52//!   <html lang="en">
53//!     <head>
54//!       <title>My Article</title>
55//!       <meta name="description" content="An interesting read">
56//!     </head>
57//!     <body>
58//!       <h1 id="main">Title</h1>
59//!       <a href="https://example.com">External Link</a>
60//!       <img src="photo.jpg" alt="A photo">
61//!     </body>
62//!   </html>
63//! "#;
64//!
65//! let config = MetadataConfig::default();
66//! let (markdown, metadata) = convert_with_metadata(html, None, config)?;
67//!
68//! // Access document metadata
69//! assert_eq!(metadata.document.title, Some("My Article".to_string()));
70//! assert_eq!(metadata.document.language, Some("en".to_string()));
71//!
72//! // Access headers
73//! assert_eq!(metadata.headers.len(), 1);
74//! assert_eq!(metadata.headers[0].level, 1);
75//! assert_eq!(metadata.headers[0].id, Some("main".to_string()));
76//!
77//! // Access links
78//! assert_eq!(metadata.links.len(), 1);
79//! assert_eq!(metadata.links[0].link_type, LinkType::External);
80//!
81//! // Access images
82//! assert_eq!(metadata.images.len(), 1);
83//! assert_eq!(metadata.images[0].image_type, ImageType::Relative);
84//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
85//! ```
86//!
87//! ## Selective Extraction
88//!
89//! ```ignore
90//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
91//!
//! let config = MetadataConfig {
//!     extract_document: true,
93//!     extract_headers: true,
94//!     extract_links: true,
95//!     extract_images: false,  // Skip images
96//!     extract_structured_data: false,  // Skip structured data
97//!     max_structured_data_size: 0,
98//! };
99//!
100//! let (markdown, metadata) = convert_with_metadata(html, None, config)?;
101//! assert_eq!(metadata.images.len(), 0);  // Images not extracted
102//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
103//! ```
104//!
105//! ## Analyzing Link Types
106//!
107//! ```ignore
108//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
109//! use html_to_markdown_rs::metadata::LinkType;
110//!
111//! let (_markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default())?;
112//!
113//! for link in &metadata.links {
114//!     match link.link_type {
115//!         LinkType::External => println!("External: {}", link.href),
116//!         LinkType::Internal => println!("Internal: {}", link.href),
117//!         LinkType::Anchor => println!("Anchor: {}", link.href),
118//!         LinkType::Email => println!("Email: {}", link.href),
119//!         _ => {}
120//!     }
121//! }
122//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
123//! ```
124//!
125//! # Serialization
126//!
127//! All types in this module support serialization via `serde` when the `metadata` feature is enabled.
128//! This enables easy export to JSON, YAML, or other formats:
129//!
130//! ```ignore
131//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
132//!
133//! let (_markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default())?;
134//! let json = serde_json::to_string_pretty(&metadata)?;
135//! println!("{}", json);
136//! # Ok::<(), Box<dyn std::error::Error>>(())
137//! ```
138
139use std::cell::RefCell;
140use std::collections::BTreeMap;
141use std::rc::Rc;
142
/// Text directionality of document content.
///
/// Corresponds to the HTML `dir` attribute and `bdi` element directionality.
///
/// The `Display` output and, when the `metadata` feature is enabled, the serde
/// representation both use the lowercase HTML keywords `ltr`, `rtl` and `auto`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub enum TextDirection {
    /// Left-to-right text flow (default for Latin scripts)
    #[cfg_attr(feature = "metadata", serde(rename = "ltr"))]
    LeftToRight,
    /// Right-to-left text flow (Hebrew, Arabic, Urdu, etc.)
    #[cfg_attr(feature = "metadata", serde(rename = "rtl"))]
    RightToLeft,
    /// Automatic directionality detection
    #[cfg_attr(feature = "metadata", serde(rename = "auto"))]
    Auto,
}

impl std::fmt::Display for TextDirection {
    /// Writes the lowercase HTML keyword (`ltr`, `rtl` or `auto`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self {
            Self::LeftToRight => "ltr",
            Self::RightToLeft => "rtl",
            Self::Auto => "auto",
        })
    }
}

impl TextDirection {
    /// Parse a text direction from string value.
    ///
    /// Matching is ASCII case-insensitive. The input is not trimmed, so
    /// surrounding whitespace makes the value invalid.
    ///
    /// # Arguments
    ///
    /// * `s` - Direction string ("ltr", "rtl", or "auto")
    ///
    /// # Returns
    ///
    /// `Some(TextDirection)` if valid, `None` otherwise.
    ///
    /// # Examples
    ///
    /// ```
    /// # use html_to_markdown_rs::metadata::TextDirection;
    /// assert_eq!(TextDirection::parse("ltr"), Some(TextDirection::LeftToRight));
    /// assert_eq!(TextDirection::parse("rtl"), Some(TextDirection::RightToLeft));
    /// assert_eq!(TextDirection::parse("auto"), Some(TextDirection::Auto));
    /// assert_eq!(TextDirection::parse("RTL"), Some(TextDirection::RightToLeft));
    /// assert_eq!(TextDirection::parse("invalid"), None);
    /// ```
    pub fn parse(s: &str) -> Option<Self> {
        // The keywords are pure ASCII, so comparing case-insensitively in place
        // avoids allocating a lowercased copy of the input on every call.
        if s.eq_ignore_ascii_case("ltr") {
            Some(Self::LeftToRight)
        } else if s.eq_ignore_ascii_case("rtl") {
            Some(Self::RightToLeft)
        } else if s.eq_ignore_ascii_case("auto") {
            Some(Self::Auto)
        } else {
            None
        }
    }
}
199
/// Category assigned to a hyperlink from the shape of its `href`.
///
/// Lets consumers filter or group extracted links. With the `metadata`
/// feature enabled, variants (de)serialize under their snake_case names,
/// which match the strings produced by `Display`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "metadata", serde(rename_all = "snake_case"))]
pub enum LinkType {
    /// Fragment link into the same document (`href` begins with `#`)
    Anchor,
    /// Link that stays on the same domain
    Internal,
    /// Link that points to a different domain
    External,
    /// `mailto:` link
    Email,
    /// `tel:` link
    Phone,
    /// Any other protocol, or a link that could not be classified
    Other,
}

impl std::fmt::Display for LinkType {
    /// Writes the snake_case name of the variant.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            LinkType::Anchor => "anchor",
            LinkType::Internal => "internal",
            LinkType::External => "external",
            LinkType::Email => "email",
            LinkType::Phone => "phone",
            LinkType::Other => "other",
        };
        f.write_str(label)
    }
}
233
/// Category assigned to an image from the shape of its source.
///
/// Distinguishes embedded images (data URIs), inline SVG, external URLs and
/// relative paths. With the `metadata` feature enabled, variants
/// (de)serialize under their snake_case names, which match the strings
/// produced by `Display`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "metadata", serde(rename_all = "snake_case"))]
pub enum ImageType {
    /// Image embedded as a data URI (base64 or another encoding)
    DataUri,
    /// Inline `<svg>` element
    InlineSvg,
    /// Image referenced by an absolute http/https URL
    External,
    /// Image referenced by a relative path
    Relative,
}

impl std::fmt::Display for ImageType {
    /// Writes the snake_case name of the variant.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            ImageType::DataUri => "data_uri",
            ImageType::InlineSvg => "inline_svg",
            ImageType::External => "external",
            ImageType::Relative => "relative",
        };
        f.write_str(label)
    }
}
261
/// Markup format a structured-data block was written in.
///
/// With the `metadata` feature enabled, the variants (de)serialize as
/// `json_ld`, `microdata` and `rdfa`, the same strings `Display` produces.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "metadata", serde(rename_all = "snake_case"))]
pub enum StructuredDataType {
    /// JSON-LD (JSON for Linking Data) script blocks
    #[cfg_attr(feature = "metadata", serde(rename = "json_ld"))]
    JsonLd,
    /// HTML5 Microdata attributes (`itemscope`, `itemtype`, `itemprop`)
    Microdata,
    /// RDF in Attributes (RDFa) markup
    #[cfg_attr(feature = "metadata", serde(rename = "rdfa"))]
    RDFa,
}

impl std::fmt::Display for StructuredDataType {
    /// Writes the snake_case identifier of the format.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            StructuredDataType::JsonLd => "json_ld",
            StructuredDataType::Microdata => "microdata",
            StructuredDataType::RDFa => "rdfa",
        };
        f.write_str(label)
    }
}
288
/// Document-level metadata extracted from `<head>` and top-level elements.
///
/// Contains all metadata typically used by search engines, social media platforms,
/// and browsers for document indexing and presentation.
///
/// `Default` is derived, so a default value has every `Option` set to `None`
/// and every collection empty; struct-update syntax (`..Default::default()`)
/// can be used to fill in only the fields of interest. The map fields are
/// `BTreeMap`s, so iteration is ordered by key.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::DocumentMetadata;
/// let doc = DocumentMetadata {
///     title: Some("My Article".to_string()),
///     description: Some("A great article about Rust".to_string()),
///     keywords: vec!["rust".to_string(), "programming".to_string()],
///     ..Default::default()
/// };
///
/// assert_eq!(doc.title, Some("My Article".to_string()));
/// ```
#[derive(Debug, Clone, Default)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct DocumentMetadata {
    /// Document title from `<title>` tag
    pub title: Option<String>,

    /// Document description from `<meta name="description">` tag
    pub description: Option<String>,

    /// Document keywords from `<meta name="keywords">` tag, split on commas
    pub keywords: Vec<String>,

    /// Document author from `<meta name="author">` tag
    pub author: Option<String>,

    /// Canonical URL from `<link rel="canonical">` tag
    pub canonical_url: Option<String>,

    /// Base URL from `<base href="">` tag for resolving relative URLs
    pub base_href: Option<String>,

    /// Document language from `lang` attribute
    pub language: Option<String>,

    /// Document text direction from `dir` attribute, as a parsed [`TextDirection`]
    pub text_direction: Option<TextDirection>,

    /// Open Graph metadata (og:* properties) for social media.
    ///
    /// Keys like "title", "description", "image", "url", etc.
    pub open_graph: BTreeMap<String, String>,

    /// Twitter Card metadata (twitter:* properties).
    ///
    /// Keys like "card", "site", "creator", "title", "description", "image", etc.
    pub twitter_card: BTreeMap<String, String>,

    /// Additional meta tags not covered by specific fields.
    ///
    /// Keys are meta name/property attributes, values are content.
    pub meta_tags: BTreeMap<String, String>,
}
346
/// Header element metadata with hierarchy tracking.
///
/// Captures heading elements (h1-h6) with their text content, identifiers,
/// and position in the document structure.
///
/// The fields are public, so a value can be built with any `level`; use
/// [`HeaderMetadata::is_valid`] to check that it denotes a real heading.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::HeaderMetadata;
/// let header = HeaderMetadata {
///     level: 1,
///     text: "Main Title".to_string(),
///     id: Some("main-title".to_string()),
///     depth: 0,
///     html_offset: 145,
/// };
///
/// assert_eq!(header.level, 1);
/// assert!(header.is_valid());
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct HeaderMetadata {
    /// Header level: 1 (h1) through 6 (h6)
    pub level: u8,

    /// Normalized text content of the header
    pub text: String,

    /// HTML id attribute if present
    pub id: Option<String>,

    /// Document tree depth at the header element
    pub depth: usize,

    /// Byte offset in original HTML document
    pub html_offset: usize,
}

impl HeaderMetadata {
    /// Validate that the header level is within valid range (1-6).
    ///
    /// # Returns
    ///
    /// `true` if level is 1-6, `false` otherwise.
    ///
    /// # Examples
    ///
    /// ```
    /// # use html_to_markdown_rs::metadata::HeaderMetadata;
    /// let valid = HeaderMetadata {
    ///     level: 3,
    ///     text: "Title".to_string(),
    ///     id: None,
    ///     depth: 2,
    ///     html_offset: 100,
    /// };
    /// assert!(valid.is_valid());
    ///
    /// let invalid = HeaderMetadata {
    ///     level: 7,  // Invalid
    ///     text: "Title".to_string(),
    ///     id: None,
    ///     depth: 2,
    ///     html_offset: 100,
    /// };
    /// assert!(!invalid.is_valid());
    /// ```
    pub fn is_valid(&self) -> bool {
        // Same range expression the collector uses when it filters headers,
        // so the two checks cannot drift apart.
        (1..=6).contains(&self.level)
    }
}
419
/// Hyperlink metadata with categorization and attributes.
///
/// Represents `<a>` elements with parsed href values, text content, and link type classification.
///
/// All fields are public and nothing ties `link_type` to `href` after
/// construction; [`LinkMetadata::classify_link`] computes the type for a
/// given `href`.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::{LinkMetadata, LinkType};
/// let link = LinkMetadata {
///     href: "https://example.com".to_string(),
///     text: "Example".to_string(),
///     title: Some("Visit Example".to_string()),
///     link_type: LinkType::External,
///     rel: vec!["nofollow".to_string()],
///     attributes: Default::default(),
/// };
///
/// assert_eq!(link.link_type, LinkType::External);
/// assert_eq!(link.text, "Example");
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct LinkMetadata {
    /// The href URL value
    pub href: String,

    /// Link text content (normalized, concatenated if mixed with elements)
    pub text: String,

    /// Optional title attribute (often shown as tooltip)
    pub title: Option<String>,

    /// Link type classification
    pub link_type: LinkType,

    /// Rel attribute values, one entry per token (e.g., "nofollow", "stylesheet", "canonical")
    pub rel: Vec<String>,

    /// Additional HTML attributes, keyed by attribute name
    pub attributes: BTreeMap<String, String>,
}
461
462impl LinkMetadata {
463    /// Classify a link based on href value.
464    ///
465    /// # Arguments
466    ///
467    /// * `href` - The href attribute value
468    ///
469    /// # Returns
470    ///
471    /// Appropriate [`LinkType`] based on protocol and content.
472    ///
473    /// # Examples
474    ///
475    /// ```
476    /// # use html_to_markdown_rs::metadata::{LinkMetadata, LinkType};
477    /// assert_eq!(LinkMetadata::classify_link("#section"), LinkType::Anchor);
478    /// assert_eq!(LinkMetadata::classify_link("mailto:test@example.com"), LinkType::Email);
479    /// assert_eq!(LinkMetadata::classify_link("tel:+1234567890"), LinkType::Phone);
480    /// assert_eq!(LinkMetadata::classify_link("https://example.com"), LinkType::External);
481    /// ```
482    pub fn classify_link(href: &str) -> LinkType {
483        if href.starts_with('#') {
484            LinkType::Anchor
485        } else if href.starts_with("mailto:") {
486            LinkType::Email
487        } else if href.starts_with("tel:") {
488            LinkType::Phone
489        } else if href.starts_with("http://") || href.starts_with("https://") {
490            LinkType::External
491        } else if href.starts_with('/') || href.starts_with("../") || href.starts_with("./") {
492            LinkType::Internal
493        } else {
494            LinkType::Other
495        }
496    }
497}
498
/// Image metadata with source and dimensions.
///
/// Captures `<img>` elements and inline `<svg>` elements with metadata
/// for image analysis and optimization.
///
/// Everything except `src` and `image_type` is optional or may be empty.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::{ImageMetadata, ImageType};
/// let img = ImageMetadata {
///     src: "https://example.com/image.jpg".to_string(),
///     alt: Some("An example image".to_string()),
///     title: Some("Example".to_string()),
///     dimensions: Some((800, 600)),
///     image_type: ImageType::External,
///     attributes: Default::default(),
/// };
///
/// assert_eq!(img.image_type, ImageType::External);
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct ImageMetadata {
    /// Image source (URL, data URI, or SVG content identifier)
    pub src: String,

    /// Alternative text from alt attribute (for accessibility)
    pub alt: Option<String>,

    /// Title attribute (often shown as tooltip)
    pub title: Option<String>,

    /// Image dimensions as (width, height) if available
    pub dimensions: Option<(u32, u32)>,

    /// Image type classification
    pub image_type: ImageType,

    /// Additional HTML attributes, keyed by attribute name
    pub attributes: BTreeMap<String, String>,
}
540
/// Structured data block (JSON-LD, Microdata, or RDFa).
///
/// Represents machine-readable structured data found in the document.
/// JSON-LD blocks are collected as raw JSON strings for flexibility.
///
/// The payload is held as a plain `String`; this type does not itself parse
/// or validate it.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::{StructuredData, StructuredDataType};
/// let schema = StructuredData {
///     data_type: StructuredDataType::JsonLd,
///     raw_json: r#"{"@context":"https://schema.org","@type":"Article"}"#.to_string(),
///     schema_type: Some("Article".to_string()),
/// };
///
/// assert_eq!(schema.data_type, StructuredDataType::JsonLd);
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct StructuredData {
    /// Type of structured data (JSON-LD, Microdata, RDFa)
    pub data_type: StructuredDataType,

    /// Raw JSON string (for JSON-LD) or serialized representation
    pub raw_json: String,

    /// Schema type if detectable (e.g., "Article", "Event", "Product");
    /// `None` when it could not be determined
    pub schema_type: Option<String>,
}
570
/// Default value of [`MetadataConfig::max_structured_data_size`]:
/// 1,000,000 bytes (1 MB).
pub const DEFAULT_MAX_STRUCTURED_DATA_SIZE: usize = 1_000_000;

/// Switches and limits that control what metadata gets extracted.
///
/// Each `extract_*` flag turns one metadata category on or off, and
/// `max_structured_data_size` caps how many bytes of structured data are kept.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::MetadataConfig;
/// let config = MetadataConfig {
///     extract_document: true,
///     extract_headers: true,
///     extract_links: true,
///     extract_images: true,
///     extract_structured_data: true,
///     max_structured_data_size: 1_000_000,
/// };
///
/// assert!(config.extract_headers);
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct MetadataConfig {
    /// Collect document-level metadata (title, description, author, etc.)
    pub extract_document: bool,

    /// Collect h1-h6 header elements and their hierarchy
    pub extract_headers: bool,

    /// Collect anchor (`a`) elements as links with type classification
    pub extract_links: bool,

    /// Collect image elements and data URIs
    pub extract_images: bool,

    /// Collect structured data (JSON-LD, Microdata, RDFa)
    pub extract_structured_data: bool,

    /// Upper bound, in bytes, on the total structured data collected.
    ///
    /// Guards against memory exhaustion on malformed or adversarial documents.
    pub max_structured_data_size: usize,
}

impl Default for MetadataConfig {
    /// Build the default configuration.
    ///
    /// Every category is enabled and the structured-data cap is
    /// [`DEFAULT_MAX_STRUCTURED_DATA_SIZE`].
    fn default() -> Self {
        MetadataConfig {
            max_structured_data_size: DEFAULT_MAX_STRUCTURED_DATA_SIZE,
            extract_structured_data: true,
            extract_images: true,
            extract_links: true,
            extract_headers: true,
            extract_document: true,
        }
    }
}
631
/// Comprehensive metadata extraction result from HTML document.
///
/// Contains all extracted metadata types in a single structure,
/// suitable for serialization and transmission across language boundaries.
///
/// `Default` is derived, so a default value holds default document metadata
/// and four empty vectors.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::ExtendedMetadata;
/// let metadata = ExtendedMetadata {
///     document: Default::default(),
///     headers: Vec::new(),
///     links: Vec::new(),
///     images: Vec::new(),
///     structured_data: Vec::new(),
/// };
///
/// assert!(metadata.headers.is_empty());
/// ```
#[derive(Debug, Clone, Default)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct ExtendedMetadata {
    /// Document-level metadata (title, description, canonical, etc.)
    pub document: DocumentMetadata,

    /// Extracted header elements with hierarchy
    pub headers: Vec<HeaderMetadata>,

    /// Extracted hyperlinks with type classification
    pub links: Vec<LinkMetadata>,

    /// Extracted images with source and dimensions
    pub images: Vec<ImageMetadata>,

    /// Extracted structured data blocks
    pub structured_data: Vec<StructuredData>,
}
669
/// Internal metadata collector for single-pass extraction.
///
/// Follows the [`InlineImageCollector`](crate::inline_images::InlineImageCollector) pattern
/// for efficient metadata extraction during tree traversal. Maintains state for:
/// - Document metadata from head elements
/// - Header hierarchy tracking
/// - Link and image accumulation
/// - Structured data collection
/// - Language and directionality attributes
///
/// # Architecture
///
/// The collector is designed to be:
/// - **Performant**: Pre-allocated collections, minimal cloning
/// - **Single-pass**: Collects during main tree walk without separate passes
/// - **Optional**: Zero overhead when disabled via feature flags
/// - **Type-safe**: Strict separation of collection and result types
///
/// # Internal State
///
/// - `head_metadata`: Raw metadata pairs from head element
/// - `headers`: Collected header elements
/// - `header_stack`: For tracking nesting depth
/// - `links`: Collected link elements
/// - `images`: Collected image elements
/// - `json_ld`: JSON-LD script block contents
/// - `structured_data_size`: Bytes of JSON-LD accepted so far
/// - `config`: Extraction switches and size limit
/// - `lang`: Document language
/// - `dir`: Document text direction
#[derive(Debug)]
// NOTE(review): the reason for allowing dead code is not visible here —
// presumably some fields/methods are unused in certain feature
// combinations; confirm before removing.
#[allow(dead_code)]
pub(crate) struct MetadataCollector {
    /// Raw key/value pairs gathered from the head section; later pairs with
    /// the same key replace earlier ones.
    head_metadata: BTreeMap<String, String>,
    /// Headers accepted so far, in the order they were added.
    headers: Vec<HeaderMetadata>,
    /// Reserved for nesting-depth tracking.
    /// NOTE(review): only initialised by `new`; no `add_*`/`set_*` method
    /// reads or writes it — confirm it is used elsewhere.
    header_stack: Vec<usize>,
    /// Links accepted so far, in the order they were added.
    links: Vec<LinkMetadata>,
    /// Images accepted so far, in the order they were added.
    images: Vec<ImageMetadata>,
    /// Raw JSON-LD block contents that fit within the size budget.
    json_ld: Vec<String>,
    /// Sum of the byte lengths of the strings stored in `json_ld`.
    structured_data_size: usize,
    /// Which categories to collect and the structured-data size limit.
    config: MetadataConfig,
    /// Document language; the first value set wins.
    lang: Option<String>,
    /// Document text direction as the raw attribute string; the first value
    /// set wins.
    dir: Option<String>,
}
712
713#[allow(dead_code)]
714impl MetadataCollector {
715    /// Create a new metadata collector with configuration.
716    ///
717    /// Pre-allocates collections based on typical document sizes
718    /// for efficient append operations during traversal.
719    ///
720    /// # Arguments
721    ///
722    /// * `config` - Extraction configuration specifying which types to collect
723    ///
724    /// # Returns
725    ///
726    /// A new collector ready for use during tree traversal.
727    ///
728    /// # Examples
729    ///
730    /// ```ignore
731    /// let config = MetadataConfig::default();
732    /// let collector = MetadataCollector::new(config);
733    /// ```
734    pub(crate) fn new(config: MetadataConfig) -> Self {
735        Self {
736            head_metadata: BTreeMap::new(),
737            headers: Vec::with_capacity(32),
738            header_stack: Vec::with_capacity(6),
739            links: Vec::with_capacity(64),
740            images: Vec::with_capacity(16),
741            json_ld: Vec::with_capacity(4),
742            structured_data_size: 0,
743            config,
744            lang: None,
745            dir: None,
746        }
747    }
748
749    /// Add a header element to the collection.
750    ///
751    /// Validates that level is in range 1-6 and tracks hierarchy via depth.
752    ///
753    /// # Arguments
754    ///
755    /// * `level` - Header level (1-6)
756    /// * `text` - Normalized header text content
757    /// * `id` - Optional HTML id attribute
758    /// * `depth` - Current document nesting depth
759    /// * `html_offset` - Byte offset in original HTML
760    pub(crate) fn add_header(&mut self, level: u8, text: String, id: Option<String>, depth: usize, html_offset: usize) {
761        if !self.config.extract_headers {
762            return;
763        }
764
765        if !(1..=6).contains(&level) {
766            return;
767        }
768
769        let header = HeaderMetadata {
770            level,
771            text,
772            id,
773            depth,
774            html_offset,
775        };
776
777        self.headers.push(header);
778    }
779
780    /// Add a link element to the collection.
781    ///
782    /// Classifies the link based on href value and stores with metadata.
783    ///
784    /// # Arguments
785    ///
786    /// * `href` - The href attribute value
787    /// * `text` - Link text content
788    /// * `title` - Optional title attribute
789    /// * `rel` - Comma/space-separated rel attribute value
790    /// * `attributes` - Additional attributes to capture (e.g., data-* or aria-* values)
791    pub(crate) fn add_link(
792        &mut self,
793        href: String,
794        text: String,
795        title: Option<String>,
796        rel: Option<String>,
797        attributes: BTreeMap<String, String>,
798    ) {
799        if !self.config.extract_links {
800            return;
801        }
802
803        let link_type = LinkMetadata::classify_link(&href);
804
805        let rel_vec = rel
806            .map(|r| r.split_whitespace().map(|s| s.to_string()).collect::<Vec<_>>())
807            .unwrap_or_default();
808
809        let link = LinkMetadata {
810            href,
811            text,
812            title,
813            link_type,
814            rel: rel_vec,
815            attributes,
816        };
817
818        self.links.push(link);
819    }
820
821    /// Add an image element to the collection.
822    ///
823    /// # Arguments
824    ///
825    /// * `src` - Image source (URL or data URI)
826    /// * `alt` - Optional alt text
827    /// * `title` - Optional title attribute
828    /// * `dimensions` - Optional (width, height) tuple
829    pub(crate) fn add_image(
830        &mut self,
831        src: String,
832        alt: Option<String>,
833        title: Option<String>,
834        dimensions: Option<(u32, u32)>,
835        attributes: BTreeMap<String, String>,
836    ) {
837        if !self.config.extract_images {
838            return;
839        }
840
841        let image_type = if src.starts_with("data:") {
842            ImageType::DataUri
843        } else if src.starts_with("http://") || src.starts_with("https://") {
844            ImageType::External
845        } else if src.starts_with('<') && src.contains("svg") {
846            ImageType::InlineSvg
847        } else {
848            ImageType::Relative
849        };
850
851        let image = ImageMetadata {
852            src,
853            alt,
854            title,
855            dimensions,
856            image_type,
857            attributes,
858        };
859
860        self.images.push(image);
861    }
862
863    /// Add a JSON-LD structured data block.
864    ///
865    /// Accumulates JSON content with size validation against configured limits.
866    ///
867    /// # Arguments
868    ///
869    /// * `json_content` - Raw JSON string content
870    pub(crate) fn add_json_ld(&mut self, json_content: String) {
871        if !self.config.extract_structured_data {
872            return;
873        }
874
875        let content_size = json_content.len();
876        if self.structured_data_size + content_size > self.config.max_structured_data_size {
877            return;
878        }
879
880        self.structured_data_size += content_size;
881        self.json_ld.push(json_content);
882    }
883
884    /// Set document head metadata from extracted head section.
885    ///
886    /// Merges metadata pairs from head elements (meta, title, link, etc.)
887    /// into the collector's head metadata store.
888    ///
889    /// # Arguments
890    ///
891    /// * `metadata` - BTreeMap of metadata key-value pairs
892    pub(crate) fn set_head_metadata(&mut self, metadata: BTreeMap<String, String>) {
893        self.head_metadata.extend(metadata);
894    }
895
896    /// Set document language attribute.
897    ///
898    /// Usually from `lang` attribute on `<html>` or `<body>` tag.
899    /// Only sets if not already set (first occurrence wins).
900    ///
901    /// # Arguments
902    ///
903    /// * `lang` - Language code (e.g., "en", "es", "fr")
904    pub(crate) fn set_language(&mut self, lang: String) {
905        if self.lang.is_none() {
906            self.lang = Some(lang);
907        }
908    }
909
910    /// Set document text direction attribute.
911    ///
912    /// Usually from `dir` attribute on `<html>` or `<body>` tag.
913    /// Only sets if not already set (first occurrence wins).
914    ///
915    /// # Arguments
916    ///
917    /// * `dir` - Direction string ("ltr", "rtl", or "auto")
918    pub(crate) fn set_text_direction(&mut self, dir: String) {
919        if self.dir.is_none() {
920            self.dir = Some(dir);
921        }
922    }
923
924    /// Extract document metadata from collected head metadata.
925    ///
926    /// Parses head metadata into structured document metadata,
927    /// handling special cases like Open Graph, Twitter Card, keywords, etc.
928    #[allow(dead_code)]
929    fn extract_document_metadata(&self) -> DocumentMetadata {
930        let mut doc = DocumentMetadata::default();
931
932        for (raw_key, value) in &self.head_metadata {
933            let mut key = raw_key.to_lowercase();
934
935            if let Some(stripped) = key.strip_prefix("meta-") {
936                key = stripped.to_string();
937            }
938
939            if key.contains(':') {
940                key = key.replace(':', "-");
941            }
942
943            match key.as_str() {
944                "title" => doc.title = Some(value.clone()),
945                "description" => doc.description = Some(value.clone()),
946                "author" => doc.author = Some(value.clone()),
947                "canonical" => doc.canonical_url = Some(value.clone()),
948                "base" | "base-href" => doc.base_href = Some(value.clone()),
949                key if key.starts_with("og-") => {
950                    let og_key = key.trim_start_matches("og-").replace('-', "_");
951                    doc.open_graph.insert(og_key, value.clone());
952                }
953                key if key.starts_with("twitter-") => {
954                    let tw_key = key.trim_start_matches("twitter-").replace('-', "_");
955                    doc.twitter_card.insert(tw_key, value.clone());
956                }
957                "keywords" => {
958                    doc.keywords = value
959                        .split(',')
960                        .map(|s| s.trim().to_string())
961                        .filter(|s| !s.is_empty())
962                        .collect();
963                }
964                _ => {
965                    doc.meta_tags.insert(key.clone(), value.clone());
966                }
967            }
968        }
969
970        if let Some(ref lang) = self.lang {
971            doc.language = Some(lang.clone());
972        }
973
974        if let Some(ref dir) = self.dir {
975            if let Some(parsed_dir) = TextDirection::parse(dir) {
976                doc.text_direction = Some(parsed_dir);
977            }
978        }
979
980        doc
981    }
982
983    /// Extract structured data blocks into StructuredData items.
984    #[allow(dead_code)]
985    fn extract_structured_data(&self) -> Vec<StructuredData> {
986        let mut result = Vec::with_capacity(self.json_ld.len());
987
988        for json_str in &self.json_ld {
989            let schema_type = serde_json::from_str::<serde_json::Value>(json_str)
990                .ok()
991                .and_then(|v| v.get("@type").and_then(|t| t.as_str().map(|s| s.to_string())));
992
993            result.push(StructuredData {
994                data_type: StructuredDataType::JsonLd,
995                raw_json: json_str.clone(),
996                schema_type,
997            });
998        }
999
1000        result
1001    }
1002
1003    /// Finish collection and return all extracted metadata.
1004    ///
1005    /// Performs final processing, validation, and consolidation of all
1006    /// collected data into the [`ExtendedMetadata`] output structure.
1007    ///
1008    /// # Returns
1009    ///
1010    /// Complete [`ExtendedMetadata`] with all extracted information.
1011    #[allow(dead_code)]
1012    pub(crate) fn finish(self) -> ExtendedMetadata {
1013        let structured_data = self.extract_structured_data();
1014        let document = self.extract_document_metadata();
1015
1016        ExtendedMetadata {
1017            document,
1018            headers: self.headers,
1019            links: self.links,
1020            images: self.images,
1021            structured_data,
1022        }
1023    }
1024
1025    /// Categorize links by type for analysis and filtering.
1026    ///
1027    /// Separates collected links into groups by [`LinkType`].
1028    /// This is an analysis helper method; actual categorization happens during add_link.
1029    ///
1030    /// # Returns
1031    ///
1032    /// BTreeMap with LinkType as key and Vec of matching LinkMetadata as value.
1033    #[allow(dead_code)]
1034    pub(crate) fn categorize_links(&self) -> BTreeMap<String, Vec<&LinkMetadata>> {
1035        let mut categorized: BTreeMap<String, Vec<&LinkMetadata>> = BTreeMap::new();
1036
1037        for link in &self.links {
1038            let category = link.link_type.to_string();
1039            categorized.entry(category).or_default().push(link);
1040        }
1041
1042        categorized
1043    }
1044
1045    /// Count headers by level for structural analysis.
1046    ///
1047    /// Returns count of headers at each level (1-6).
1048    ///
1049    /// # Returns
1050    ///
1051    /// BTreeMap with level as string key and count as value.
1052    #[allow(dead_code)]
1053    pub(crate) fn header_counts(&self) -> BTreeMap<String, usize> {
1054        let mut counts: BTreeMap<String, usize> = BTreeMap::new();
1055
1056        for header in &self.headers {
1057            *counts.entry(header.level.to_string()).or_insert(0) += 1;
1058        }
1059
1060        counts
1061    }
1062}
1063
/// Handle to a metadata collector via reference-counted mutable cell.
///
/// This is an alias for `Rc<RefCell<MetadataCollector>>`: `Rc` allows several
/// owners on one thread to hold the same collector, and `RefCell` checks borrows
/// at runtime, so overlapping `borrow_mut()` calls panic rather than fail to compile.
///
/// Used internally for sharing collector state across the tree traversal.
/// Matches the pattern used for [`InlineImageCollector`](crate::inline_images::InlineImageCollector).
///
/// # Examples
///
/// ```ignore
/// let collector = MetadataCollector::new(MetadataConfig::default());
/// let handle = Rc::new(RefCell::new(collector));
///
/// // In tree walk, can be passed and borrowed
/// handle.borrow_mut().add_header(1, "Title".to_string(), None, 0, 100);
///
/// // NOTE(review): `RefCell::take` requires `MetadataCollector: Default` — confirm.
/// let metadata = handle.take().finish();
/// ```
#[allow(dead_code)]
pub(crate) type MetadataCollectorHandle = Rc<RefCell<MetadataCollector>>;
1082
/// Unit tests for the metadata types and for [`MetadataCollector`].
#[cfg(test)]
mod tests {
    use super::*;

    /// Parsing accepts the three known directions, ignores case, and rejects anything else.
    #[test]
    fn test_text_direction_parse() {
        assert_eq!(TextDirection::parse("ltr"), Some(TextDirection::LeftToRight));
        assert_eq!(TextDirection::parse("rtl"), Some(TextDirection::RightToLeft));
        assert_eq!(TextDirection::parse("auto"), Some(TextDirection::Auto));
        assert_eq!(TextDirection::parse("invalid"), None);
        assert_eq!(TextDirection::parse("LTR"), Some(TextDirection::LeftToRight));
    }

    /// `Display` renders each direction as its lowercase attribute value.
    #[test]
    fn test_text_direction_display() {
        assert_eq!(TextDirection::LeftToRight.to_string(), "ltr");
        assert_eq!(TextDirection::RightToLeft.to_string(), "rtl");
        assert_eq!(TextDirection::Auto.to_string(), "auto");
    }

    /// Each href shape maps to the expected [`LinkType`].
    #[test]
    fn test_link_classification() {
        assert_eq!(LinkMetadata::classify_link("#section"), LinkType::Anchor);
        assert_eq!(LinkMetadata::classify_link("mailto:test@example.com"), LinkType::Email);
        assert_eq!(LinkMetadata::classify_link("tel:+1234567890"), LinkType::Phone);
        assert_eq!(LinkMetadata::classify_link("https://example.com"), LinkType::External);
        assert_eq!(LinkMetadata::classify_link("http://example.com"), LinkType::External);
        assert_eq!(LinkMetadata::classify_link("/path/to/page"), LinkType::Internal);
        assert_eq!(LinkMetadata::classify_link("../relative"), LinkType::Internal);
        assert_eq!(LinkMetadata::classify_link("./same"), LinkType::Internal);
    }

    /// Header levels 0 and 7 are rejected by `is_valid`; level 3 is accepted.
    #[test]
    fn test_header_validation() {
        let valid = HeaderMetadata {
            level: 3,
            text: "Title".to_string(),
            id: None,
            depth: 2,
            html_offset: 100,
        };
        assert!(valid.is_valid());

        let invalid_high = HeaderMetadata {
            level: 7,
            text: "Title".to_string(),
            id: None,
            depth: 2,
            html_offset: 100,
        };
        assert!(!invalid_high.is_valid());

        let invalid_low = HeaderMetadata {
            level: 0,
            text: "Title".to_string(),
            id: None,
            depth: 2,
            html_offset: 100,
        };
        assert!(!invalid_low.is_valid());
    }

    /// A new collector pre-allocates its buffers with the expected capacities.
    #[test]
    fn test_metadata_collector_new() {
        let config = MetadataConfig::default();
        let collector = MetadataCollector::new(config);

        assert_eq!(collector.headers.capacity(), 32);
        assert_eq!(collector.links.capacity(), 64);
        assert_eq!(collector.images.capacity(), 16);
        assert_eq!(collector.json_ld.capacity(), 4);
    }

    /// A valid header is stored with its fields; a level-7 header is dropped.
    #[test]
    fn test_metadata_collector_add_header() {
        let config = MetadataConfig::default();
        let mut collector = MetadataCollector::new(config);

        collector.add_header(1, "Title".to_string(), Some("title".to_string()), 0, 100);
        assert_eq!(collector.headers.len(), 1);

        let header = &collector.headers[0];
        assert_eq!(header.level, 1);
        assert_eq!(header.text, "Title");
        assert_eq!(header.id, Some("title".to_string()));

        collector.add_header(7, "Invalid".to_string(), None, 0, 200);
        assert_eq!(collector.headers.len(), 1);
    }

    /// A link is stored with its classification, split `rel` tokens, and attributes.
    #[test]
    fn test_metadata_collector_add_link() {
        let config = MetadataConfig::default();
        let mut collector = MetadataCollector::new(config);

        collector.add_link(
            "https://example.com".to_string(),
            "Example".to_string(),
            Some("Visit".to_string()),
            Some("nofollow external".to_string()),
            BTreeMap::from([("data-id".to_string(), "example".to_string())]),
        );

        assert_eq!(collector.links.len(), 1);

        let link = &collector.links[0];
        assert_eq!(link.href, "https://example.com");
        assert_eq!(link.text, "Example");
        assert_eq!(link.link_type, LinkType::External);
        assert_eq!(link.rel, vec!["nofollow", "external"]);
        assert_eq!(link.attributes.get("data-id"), Some(&"example".to_string()));
    }

    /// With every extraction flag off, none of the `add_*` calls stores anything.
    #[test]
    fn test_metadata_collector_respects_config() {
        let config = MetadataConfig {
            extract_document: false,
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: DEFAULT_MAX_STRUCTURED_DATA_SIZE,
        };
        let mut collector = MetadataCollector::new(config);

        collector.add_header(1, "Title".to_string(), None, 0, 100);
        collector.add_link(
            "https://example.com".to_string(),
            "Link".to_string(),
            None,
            None,
            BTreeMap::new(),
        );
        collector.add_image(
            "https://example.com/img.jpg".to_string(),
            None,
            None,
            None,
            BTreeMap::new(),
        );
        collector.add_json_ld("{}".to_string());

        assert!(collector.headers.is_empty());
        assert!(collector.links.is_empty());
        assert!(collector.images.is_empty());
        assert!(collector.json_ld.is_empty());
    }

    /// `finish` carries the language, headers, and links into the result.
    #[test]
    fn test_metadata_collector_finish() {
        let config = MetadataConfig::default();
        let mut collector = MetadataCollector::new(config);

        collector.set_language("en".to_string());
        collector.add_header(1, "Main Title".to_string(), None, 0, 100);
        collector.add_link(
            "https://example.com".to_string(),
            "Example".to_string(),
            None,
            None,
            BTreeMap::new(),
        );

        let metadata = collector.finish();

        assert_eq!(metadata.document.language, Some("en".to_string()));
        assert_eq!(metadata.headers.len(), 1);
        assert_eq!(metadata.links.len(), 1);
    }

    /// A default `DocumentMetadata` has no scalar fields set and empty collections.
    #[test]
    fn test_document_metadata_default() {
        let doc = DocumentMetadata::default();

        assert!(doc.title.is_none());
        assert!(doc.description.is_none());
        assert!(doc.keywords.is_empty());
        assert!(doc.open_graph.is_empty());
        assert!(doc.twitter_card.is_empty());
        assert!(doc.meta_tags.is_empty());
    }

    /// The default config enables the checked extractors and uses the default size limit.
    #[test]
    fn test_metadata_config_default() {
        let config = MetadataConfig::default();

        assert!(config.extract_headers);
        assert!(config.extract_links);
        assert!(config.extract_images);
        assert!(config.extract_structured_data);
        assert_eq!(config.max_structured_data_size, DEFAULT_MAX_STRUCTURED_DATA_SIZE);
    }

    /// `add_image` derives the image type from the shape of `src`.
    ///
    /// Sources go through the collector rather than a hand-built `ImageMetadata`,
    /// so the assertions exercise the classifier itself.
    #[test]
    fn test_image_type_classification() {
        let mut collector = MetadataCollector::new(MetadataConfig::default());

        let cases = [
            ("data:image/png;base64,iVBORw0KG...", ImageType::DataUri),
            ("https://example.com/image.jpg", ImageType::External),
            ("http://example.com/image.jpg", ImageType::External),
            ("<svg xmlns=\"http://www.w3.org/2000/svg\"></svg>", ImageType::InlineSvg),
            ("images/photo.png", ImageType::Relative),
            ("/static/logo.png", ImageType::Relative),
        ];

        for (src, _) in &cases {
            collector.add_image(src.to_string(), None, None, None, BTreeMap::new());
        }

        assert_eq!(collector.images.len(), cases.len());

        for (image, (src, expected)) in collector.images.iter().zip(&cases) {
            assert_eq!(image.src, *src);
            assert_eq!(image.image_type, *expected);
        }
    }

    /// `Display` renders each link type as its lowercase name.
    #[test]
    fn test_link_type_display() {
        assert_eq!(LinkType::Anchor.to_string(), "anchor");
        assert_eq!(LinkType::Internal.to_string(), "internal");
        assert_eq!(LinkType::External.to_string(), "external");
        assert_eq!(LinkType::Email.to_string(), "email");
        assert_eq!(LinkType::Phone.to_string(), "phone");
        assert_eq!(LinkType::Other.to_string(), "other");
    }

    /// `Display` renders each structured data type as its snake-case name.
    #[test]
    fn test_structured_data_type_display() {
        assert_eq!(StructuredDataType::JsonLd.to_string(), "json_ld");
        assert_eq!(StructuredDataType::Microdata.to_string(), "microdata");
        assert_eq!(StructuredDataType::RDFa.to_string(), "rdfa");
    }

    /// Links are grouped under the string form of their type.
    #[test]
    fn test_categorize_links() {
        let config = MetadataConfig::default();
        let mut collector = MetadataCollector::new(config);

        collector.add_link("#anchor".to_string(), "Anchor".to_string(), None, None, BTreeMap::new());
        collector.add_link(
            "https://example.com".to_string(),
            "External".to_string(),
            None,
            None,
            BTreeMap::new(),
        );
        collector.add_link(
            "mailto:test@example.com".to_string(),
            "Email".to_string(),
            None,
            None,
            BTreeMap::new(),
        );

        let categorized = collector.categorize_links();

        assert_eq!(categorized.get("anchor").map(|v| v.len()), Some(1));
        assert_eq!(categorized.get("external").map(|v| v.len()), Some(1));
        assert_eq!(categorized.get("email").map(|v| v.len()), Some(1));
    }

    /// Headers are counted per level, keyed by the level as a string.
    #[test]
    fn test_header_counts() {
        let config = MetadataConfig::default();
        let mut collector = MetadataCollector::new(config);

        collector.add_header(1, "H1".to_string(), None, 0, 100);
        collector.add_header(2, "H2".to_string(), None, 1, 200);
        collector.add_header(2, "H2b".to_string(), None, 1, 300);
        collector.add_header(3, "H3".to_string(), None, 2, 400);

        let counts = collector.header_counts();

        assert_eq!(counts.get("1").copied(), Some(1));
        assert_eq!(counts.get("2").copied(), Some(2));
        assert_eq!(counts.get("3").copied(), Some(1));
    }
}
html_to_markdown_rs/metadata.rs

html_to_markdown_rs/
metadata.rs