html_to_markdown_rs/metadata.rs
1//! Metadata extraction for HTML to Markdown conversion.
2//!
3//! This module provides comprehensive, type-safe metadata extraction during HTML-to-Markdown
4//! conversion, enabling content analysis, SEO optimization, and document indexing workflows.
5//! Metadata includes:
6//! - **Document metadata**: Title, description, author, language, canonical URL, Open Graph, Twitter Card
7//! - **Headers**: Heading elements (h1-h6) with hierarchy, IDs, and positions
8//! - **Links**: Hyperlinks with type classification (anchor, internal, external, email, phone)
9//! - **Images**: Image elements with source, alt text, dimensions, and type (data URI, external, etc.)
10//! - **Structured data**: JSON-LD, Microdata, and RDFa blocks
11//!
//! The implementation follows a single-pass collector pattern: metadata is gathered during
//! the main conversion traversal, and extraction adds no overhead when the metadata feature is disabled.
14//!
15//! # Architecture
16//!
17//! Metadata extraction uses the [`MetadataCollector`] pattern (similar to [`InlineImageCollector`]):
18//! - **Single-pass collection**: Metadata is gathered during the primary tree traversal without additional passes
19//! - **Zero overhead when disabled**: Entire module can be compiled out via feature flags
20//! - **Configurable granularity**: Use [`MetadataConfig`] to select which metadata types to extract
21//! - **Type-safe APIs**: All metadata types are enum-based with exhaustive matching
22//! - **Memory-bounded**: Size limits prevent memory exhaustion from adversarial documents
23//! - **Pre-allocated buffers**: Typical documents (32 headers, 64 links, 16 images) handled efficiently
24//!
25//! # Type Overview
26//!
27//! ## Enumerations
28//!
29//! - [`TextDirection`]: Document directionality (LTR, RTL, Auto)
30//! - [`LinkType`]: Link classification (Anchor, Internal, External, Email, Phone, Other)
31//! - [`ImageType`]: Image source type (DataUri, External, Relative, InlineSvg)
32//! - [`StructuredDataType`]: Structured data format (JsonLd, Microdata, RDFa)
33//!
34//! ## Structures
35//!
36//! - [`DocumentMetadata`]: Head-level metadata with maps for Open Graph and Twitter Card
37//! - [`HeaderMetadata`]: Heading element with level (1-6), text, ID, hierarchy depth, and position
38//! - [`LinkMetadata`]: Hyperlink with href, text, title, type, rel attributes, and custom attributes
39//! - [`ImageMetadata`]: Image element with src, alt, title, dimensions, type, and attributes
40//! - [`StructuredData`]: Structured data block with type and raw JSON
41//! - [`MetadataConfig`]: Configuration controlling extraction granularity and size limits
42//! - [`ExtendedMetadata`]: Top-level result containing all extracted metadata
43//!
44//! # Examples
45//!
46//! ## Basic Usage with convert_with_metadata
47//!
48//! ```ignore
//! use html_to_markdown_rs::metadata::{ImageType, LinkType};
//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
50//!
51//! let html = r#"
52//! <html lang="en">
53//! <head>
54//! <title>My Article</title>
55//! <meta name="description" content="An interesting read">
56//! </head>
57//! <body>
58//! <h1 id="main">Title</h1>
59//! <a href="https://example.com">External Link</a>
60//! <img src="photo.jpg" alt="A photo">
61//! </body>
62//! </html>
63//! "#;
64//!
65//! let config = MetadataConfig::default();
66//! let (markdown, metadata) = convert_with_metadata(html, None, config)?;
67//!
68//! // Access document metadata
69//! assert_eq!(metadata.document.title, Some("My Article".to_string()));
70//! assert_eq!(metadata.document.language, Some("en".to_string()));
71//!
72//! // Access headers
73//! assert_eq!(metadata.headers.len(), 1);
74//! assert_eq!(metadata.headers[0].level, 1);
75//! assert_eq!(metadata.headers[0].id, Some("main".to_string()));
76//!
77//! // Access links
78//! assert_eq!(metadata.links.len(), 1);
79//! assert_eq!(metadata.links[0].link_type, LinkType::External);
80//!
81//! // Access images
82//! assert_eq!(metadata.images.len(), 1);
83//! assert_eq!(metadata.images[0].image_type, ImageType::Relative);
84//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
85//! ```
86//!
87//! ## Selective Extraction
88//!
89//! ```ignore
90//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
91//!
//! let config = MetadataConfig {
//!     extract_document: true,
//!     extract_headers: true,
//!     extract_links: true,
//!     extract_images: false,          // Skip images
//!     extract_structured_data: false, // Skip structured data
//!     max_structured_data_size: 0,
//! };
99//!
100//! let (markdown, metadata) = convert_with_metadata(html, None, config)?;
101//! assert_eq!(metadata.images.len(), 0); // Images not extracted
102//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
103//! ```
104//!
105//! ## Analyzing Link Types
106//!
107//! ```ignore
108//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
109//! use html_to_markdown_rs::metadata::LinkType;
110//!
111//! let (_markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default())?;
112//!
113//! for link in &metadata.links {
114//! match link.link_type {
115//! LinkType::External => println!("External: {}", link.href),
116//! LinkType::Internal => println!("Internal: {}", link.href),
117//! LinkType::Anchor => println!("Anchor: {}", link.href),
118//! LinkType::Email => println!("Email: {}", link.href),
119//! _ => {}
120//! }
121//! }
122//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
123//! ```
124//!
125//! # Serialization
126//!
127//! All types in this module support serialization via `serde` when the `metadata` feature is enabled.
128//! This enables easy export to JSON, YAML, or other formats:
129//!
130//! ```ignore
131//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
132//!
133//! let (_markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default())?;
134//! let json = serde_json::to_string_pretty(&metadata)?;
135//! println!("{}", json);
136//! # Ok::<(), Box<dyn std::error::Error>>(())
137//! ```
138
139use std::cell::RefCell;
140use std::collections::BTreeMap;
141use std::rc::Rc;
142
/// Text directionality of document content.
///
/// Mirrors the keywords accepted by the HTML `dir` attribute (`ltr`, `rtl`, `auto`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub enum TextDirection {
    /// Left-to-right text flow (default for Latin scripts)
    #[cfg_attr(feature = "metadata", serde(rename = "ltr"))]
    LeftToRight,
    /// Right-to-left text flow (Hebrew, Arabic, Urdu, etc.)
    #[cfg_attr(feature = "metadata", serde(rename = "rtl"))]
    RightToLeft,
    /// Automatic directionality detection
    #[cfg_attr(feature = "metadata", serde(rename = "auto"))]
    Auto,
}

impl std::fmt::Display for TextDirection {
    /// Writes the lowercase HTML keyword (`ltr`, `rtl` or `auto`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self {
            Self::LeftToRight => "ltr",
            Self::RightToLeft => "rtl",
            Self::Auto => "auto",
        })
    }
}

impl TextDirection {
    /// Parse a text direction from its keyword.
    ///
    /// Matching is ASCII case-insensitive and performed without allocating.
    /// Surrounding whitespace is *not* trimmed, so `" ltr"` is rejected.
    ///
    /// # Arguments
    ///
    /// * `s` - Direction string ("ltr", "rtl", or "auto", in any letter case)
    ///
    /// # Returns
    ///
    /// `Some(TextDirection)` if `s` is one of the three keywords, `None` otherwise.
    ///
    /// # Examples
    ///
    /// ```
    /// # use html_to_markdown_rs::metadata::TextDirection;
    /// assert_eq!(TextDirection::parse("ltr"), Some(TextDirection::LeftToRight));
    /// assert_eq!(TextDirection::parse("RTL"), Some(TextDirection::RightToLeft));
    /// assert_eq!(TextDirection::parse("auto"), Some(TextDirection::Auto));
    /// assert_eq!(TextDirection::parse("invalid"), None);
    /// ```
    pub fn parse(s: &str) -> Option<Self> {
        // Comparing in place avoids the temporary `String` that lowercasing
        // the input would allocate on every call.
        if s.eq_ignore_ascii_case("ltr") {
            Some(Self::LeftToRight)
        } else if s.eq_ignore_ascii_case("rtl") {
            Some(Self::RightToLeft)
        } else if s.eq_ignore_ascii_case("auto") {
            Some(Self::Auto)
        } else {
            None
        }
    }
}
199
/// Category assigned to a hyperlink, derived from its `href` value.
///
/// Lets consumers filter or group extracted links.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "metadata", serde(rename_all = "snake_case"))]
pub enum LinkType {
    /// Same-document fragment link (href begins with `#`)
    Anchor,
    /// Link that stays within the same domain
    Internal,
    /// Link that points to a different domain
    External,
    /// `mailto:` link
    Email,
    /// `tel:` link
    Phone,
    /// Any other protocol, or a link that could not be classified
    Other,
}

impl std::fmt::Display for LinkType {
    /// Writes the lowercase variant name (e.g. `external`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Anchor => "anchor",
            Self::Internal => "internal",
            Self::External => "external",
            Self::Email => "email",
            Self::Phone => "phone",
            Self::Other => "other",
        };
        f.write_str(label)
    }
}
233
/// Category assigned to an image, describing where its data comes from.
///
/// Distinguishes embedded data URIs, inline SVG, external URLs and relative paths.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "metadata", serde(rename_all = "snake_case"))]
pub enum ImageType {
    /// Image embedded as a data URI (base64 or another encoding)
    DataUri,
    /// Inline `<svg>` element
    InlineSvg,
    /// Image referenced by an external URL (http/https)
    External,
    /// Image referenced by a relative path
    Relative,
}

impl std::fmt::Display for ImageType {
    /// Writes the snake_case variant name (e.g. `data_uri`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::DataUri => "data_uri",
            Self::InlineSvg => "inline_svg",
            Self::External => "external",
            Self::Relative => "relative",
        };
        f.write_str(label)
    }
}
261
/// Markup format a structured data block was expressed in.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "metadata", serde(rename_all = "snake_case"))]
pub enum StructuredDataType {
    /// JSON-LD (JSON for Linking Data) script blocks
    #[cfg_attr(feature = "metadata", serde(rename = "json_ld"))]
    JsonLd,
    /// HTML5 Microdata attributes (itemscope, itemtype, itemprop)
    Microdata,
    /// RDF in Attributes (RDFa) markup; renamed explicitly so it serializes as `rdfa`
    #[cfg_attr(feature = "metadata", serde(rename = "rdfa"))]
    RDFa,
}

impl std::fmt::Display for StructuredDataType {
    /// Writes the same snake_case label used for serialization.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::JsonLd => "json_ld",
            Self::Microdata => "microdata",
            Self::RDFa => "rdfa",
        };
        f.write_str(label)
    }
}
288
289/// Document-level metadata extracted from `<head>` and top-level elements.
290///
291/// Contains all metadata typically used by search engines, social media platforms,
292/// and browsers for document indexing and presentation.
293///
294/// # Examples
295///
296/// ```
297/// # use html_to_markdown_rs::metadata::DocumentMetadata;
298/// let doc = DocumentMetadata {
299/// title: Some("My Article".to_string()),
300/// description: Some("A great article about Rust".to_string()),
301/// keywords: vec!["rust".to_string(), "programming".to_string()],
302/// ..Default::default()
303/// };
304///
305/// assert_eq!(doc.title, Some("My Article".to_string()));
306/// ```
307#[derive(Debug, Clone, Default)]
308#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
309pub struct DocumentMetadata {
310 /// Document title from `<title>` tag
311 pub title: Option<String>,
312
313 /// Document description from `<meta name="description">` tag
314 pub description: Option<String>,
315
316 /// Document keywords from `<meta name="keywords">` tag, split on commas
317 pub keywords: Vec<String>,
318
319 /// Document author from `<meta name="author">` tag
320 pub author: Option<String>,
321
322 /// Canonical URL from `<link rel="canonical">` tag
323 pub canonical_url: Option<String>,
324
325 /// Base URL from `<base href="">` tag for resolving relative URLs
326 pub base_href: Option<String>,
327
328 /// Document language from `lang` attribute
329 pub language: Option<String>,
330
331 /// Document text direction from `dir` attribute
332 pub text_direction: Option<TextDirection>,
333
334 /// Open Graph metadata (og:* properties) for social media
335 /// Keys like "title", "description", "image", "url", etc.
336 pub open_graph: BTreeMap<String, String>,
337
338 /// Twitter Card metadata (twitter:* properties)
339 /// Keys like "card", "site", "creator", "title", "description", "image", etc.
340 pub twitter_card: BTreeMap<String, String>,
341
342 /// Additional meta tags not covered by specific fields
343 /// Keys are meta name/property attributes, values are content
344 pub meta_tags: BTreeMap<String, String>,
345}
346
/// Header element metadata with hierarchy tracking.
///
/// Captures heading elements (h1-h6) with their text content, identifiers,
/// and position in the document structure.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::HeaderMetadata;
/// let header = HeaderMetadata {
///     level: 1,
///     text: "Main Title".to_string(),
///     id: Some("main-title".to_string()),
///     depth: 0,
///     html_offset: 145,
/// };
///
/// assert_eq!(header.level, 1);
/// assert!(header.is_valid());
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct HeaderMetadata {
    /// Header level: 1 (h1) through 6 (h6)
    pub level: u8,

    /// Normalized text content of the header
    pub text: String,

    /// HTML id attribute if present
    pub id: Option<String>,

    /// Document tree depth at the header element
    pub depth: usize,

    /// Byte offset in original HTML document
    pub html_offset: usize,
}

impl HeaderMetadata {
    /// Check that the header level is within the valid range (1-6).
    ///
    /// The field is public, so a value can be constructed (or deserialized)
    /// with any `u8`; this is the guard for such cases.
    ///
    /// # Returns
    ///
    /// `true` if level is 1-6, `false` otherwise.
    ///
    /// # Examples
    ///
    /// ```
    /// # use html_to_markdown_rs::metadata::HeaderMetadata;
    /// let valid = HeaderMetadata {
    ///     level: 3,
    ///     text: "Title".to_string(),
    ///     id: None,
    ///     depth: 2,
    ///     html_offset: 100,
    /// };
    /// assert!(valid.is_valid());
    ///
    /// let invalid = HeaderMetadata {
    ///     level: 7, // Invalid
    ///     text: "Title".to_string(),
    ///     id: None,
    ///     depth: 2,
    ///     html_offset: 100,
    /// };
    /// assert!(!invalid.is_valid());
    /// ```
    #[must_use]
    pub fn is_valid(&self) -> bool {
        (1..=6).contains(&self.level)
    }
}
419
420/// Hyperlink metadata with categorization and attributes.
421///
422/// Represents `<a>` elements with parsed href values, text content, and link type classification.
423///
424/// # Examples
425///
426/// ```
427/// # use html_to_markdown_rs::metadata::{LinkMetadata, LinkType};
428/// let link = LinkMetadata {
429/// href: "https://example.com".to_string(),
430/// text: "Example".to_string(),
431/// title: Some("Visit Example".to_string()),
432/// link_type: LinkType::External,
433/// rel: vec!["nofollow".to_string()],
434/// attributes: Default::default(),
435/// };
436///
437/// assert_eq!(link.link_type, LinkType::External);
438/// assert_eq!(link.text, "Example");
439/// ```
440#[derive(Debug, Clone)]
441#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
442pub struct LinkMetadata {
443 /// The href URL value
444 pub href: String,
445
446 /// Link text content (normalized, concatenated if mixed with elements)
447 pub text: String,
448
449 /// Optional title attribute (often shown as tooltip)
450 pub title: Option<String>,
451
452 /// Link type classification
453 pub link_type: LinkType,
454
455 /// Rel attribute values (e.g., "nofollow", "stylesheet", "canonical")
456 pub rel: Vec<String>,
457
458 /// Additional HTML attributes
459 pub attributes: BTreeMap<String, String>,
460}
461
462impl LinkMetadata {
463 /// Classify a link based on href value.
464 ///
465 /// # Arguments
466 ///
467 /// * `href` - The href attribute value
468 ///
469 /// # Returns
470 ///
471 /// Appropriate [`LinkType`] based on protocol and content.
472 ///
473 /// # Examples
474 ///
475 /// ```
476 /// # use html_to_markdown_rs::metadata::{LinkMetadata, LinkType};
477 /// assert_eq!(LinkMetadata::classify_link("#section"), LinkType::Anchor);
478 /// assert_eq!(LinkMetadata::classify_link("mailto:test@example.com"), LinkType::Email);
479 /// assert_eq!(LinkMetadata::classify_link("tel:+1234567890"), LinkType::Phone);
480 /// assert_eq!(LinkMetadata::classify_link("https://example.com"), LinkType::External);
481 /// ```
482 pub fn classify_link(href: &str) -> LinkType {
483 if href.starts_with('#') {
484 LinkType::Anchor
485 } else if href.starts_with("mailto:") {
486 LinkType::Email
487 } else if href.starts_with("tel:") {
488 LinkType::Phone
489 } else if href.starts_with("http://") || href.starts_with("https://") {
490 LinkType::External
491 } else if href.starts_with('/') || href.starts_with("../") || href.starts_with("./") {
492 LinkType::Internal
493 } else {
494 LinkType::Other
495 }
496 }
497}
498
499/// Image metadata with source and dimensions.
500///
501/// Captures `<img>` elements and inline `<svg>` elements with metadata
502/// for image analysis and optimization.
503///
504/// # Examples
505///
506/// ```
507/// # use html_to_markdown_rs::metadata::{ImageMetadata, ImageType};
508/// let img = ImageMetadata {
509/// src: "https://example.com/image.jpg".to_string(),
510/// alt: Some("An example image".to_string()),
511/// title: Some("Example".to_string()),
512/// dimensions: Some((800, 600)),
513/// image_type: ImageType::External,
514/// attributes: Default::default(),
515/// };
516///
517/// assert_eq!(img.image_type, ImageType::External);
518/// ```
519#[derive(Debug, Clone)]
520#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
521pub struct ImageMetadata {
522 /// Image source (URL, data URI, or SVG content identifier)
523 pub src: String,
524
525 /// Alternative text from alt attribute (for accessibility)
526 pub alt: Option<String>,
527
528 /// Title attribute (often shown as tooltip)
529 pub title: Option<String>,
530
531 /// Image dimensions as (width, height) if available
532 pub dimensions: Option<(u32, u32)>,
533
534 /// Image type classification
535 pub image_type: ImageType,
536
537 /// Additional HTML attributes
538 pub attributes: BTreeMap<String, String>,
539}
540
541/// Structured data block (JSON-LD, Microdata, or RDFa).
542///
543/// Represents machine-readable structured data found in the document.
544/// JSON-LD blocks are collected as raw JSON strings for flexibility.
545///
546/// # Examples
547///
548/// ```
549/// # use html_to_markdown_rs::metadata::{StructuredData, StructuredDataType};
550/// let schema = StructuredData {
551/// data_type: StructuredDataType::JsonLd,
552/// raw_json: r#"{"@context":"https://schema.org","@type":"Article"}"#.to_string(),
553/// schema_type: Some("Article".to_string()),
554/// };
555///
556/// assert_eq!(schema.data_type, StructuredDataType::JsonLd);
557/// ```
558#[derive(Debug, Clone)]
559#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
560pub struct StructuredData {
561 /// Type of structured data (JSON-LD, Microdata, RDFa)
562 pub data_type: StructuredDataType,
563
564 /// Raw JSON string (for JSON-LD) or serialized representation
565 pub raw_json: String,
566
567 /// Schema type if detectable (e.g., "Article", "Event", "Product")
568 pub schema_type: Option<String>,
569}
570
/// Default cap on collected structured data: 1 MB (1,000,000 bytes).
pub const DEFAULT_MAX_STRUCTURED_DATA_SIZE: usize = 1_000_000;

/// Switches and limits that control what metadata gets extracted.
///
/// Each `extract_*` flag enables one category of metadata;
/// `max_structured_data_size` bounds how much structured data is retained.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::MetadataConfig;
/// let config = MetadataConfig {
///     extract_document: true,
///     extract_headers: true,
///     extract_links: true,
///     extract_images: true,
///     extract_structured_data: true,
///     max_structured_data_size: 1_000_000,
/// };
///
/// assert!(config.extract_headers);
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct MetadataConfig {
    /// Collect document-level metadata (title, description, author, etc.)
    pub extract_document: bool,

    /// Collect h1-h6 header elements and their hierarchy
    pub extract_headers: bool,

    /// Collect anchor (`a`) elements as classified links
    pub extract_links: bool,

    /// Collect image elements and data URIs
    pub extract_images: bool,

    /// Collect structured data (JSON-LD, Microdata, RDFa)
    pub extract_structured_data: bool,

    /// Upper bound, in bytes, on the total structured data collected.
    /// Guards against memory exhaustion on malformed or adversarial documents.
    pub max_structured_data_size: usize,
}

impl Default for MetadataConfig {
    /// Enable every extraction category and cap structured data at
    /// [`DEFAULT_MAX_STRUCTURED_DATA_SIZE`].
    fn default() -> Self {
        let max_structured_data_size = DEFAULT_MAX_STRUCTURED_DATA_SIZE;

        Self {
            max_structured_data_size,
            extract_structured_data: true,
            extract_images: true,
            extract_links: true,
            extract_headers: true,
            extract_document: true,
        }
    }
}
631
632/// Comprehensive metadata extraction result from HTML document.
633///
634/// Contains all extracted metadata types in a single structure,
635/// suitable for serialization and transmission across language boundaries.
636///
637/// # Examples
638///
639/// ```
640/// # use html_to_markdown_rs::metadata::ExtendedMetadata;
641/// let metadata = ExtendedMetadata {
642/// document: Default::default(),
643/// headers: Vec::new(),
644/// links: Vec::new(),
645/// images: Vec::new(),
646/// structured_data: Vec::new(),
647/// };
648///
649/// assert!(metadata.headers.is_empty());
650/// ```
651#[derive(Debug, Clone, Default)]
652#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
653pub struct ExtendedMetadata {
654 /// Document-level metadata (title, description, canonical, etc.)
655 pub document: DocumentMetadata,
656
657 /// Extracted header elements with hierarchy
658 pub headers: Vec<HeaderMetadata>,
659
660 /// Extracted hyperlinks with type classification
661 pub links: Vec<LinkMetadata>,
662
663 /// Extracted images with source and dimensions
664 pub images: Vec<ImageMetadata>,
665
666 /// Extracted structured data blocks
667 pub structured_data: Vec<StructuredData>,
668}
669
670/// Internal metadata collector for single-pass extraction.
671///
672/// Follows the [`InlineImageCollector`](crate::inline_images::InlineImageCollector) pattern
673/// for efficient metadata extraction during tree traversal. Maintains state for:
674/// - Document metadata from head elements
675/// - Header hierarchy tracking
676/// - Link accumulation
677/// - Structured data collection
678/// - Language and directionality attributes
679///
680/// # Architecture
681///
682/// The collector is designed to be:
683/// - **Performant**: Pre-allocated collections, minimal cloning
684/// - **Single-pass**: Collects during main tree walk without separate passes
685/// - **Optional**: Zero overhead when disabled via feature flags
686/// - **Type-safe**: Strict separation of collection and result types
687///
688/// # Internal State
689///
690/// - `head_metadata`: Raw metadata pairs from head element
691/// - `headers`: Collected header elements
692/// - `header_stack`: For tracking nesting depth
693/// - `links`: Collected link elements
694/// - `base_href`: Base URL for relative link resolution
695/// - `json_ld`: JSON-LD script block contents
696/// - `lang`: Document language
697/// - `dir`: Document text direction
698#[derive(Debug)]
699#[allow(dead_code)]
700pub(crate) struct MetadataCollector {
701 head_metadata: BTreeMap<String, String>,
702 headers: Vec<HeaderMetadata>,
703 header_stack: Vec<usize>,
704 links: Vec<LinkMetadata>,
705 images: Vec<ImageMetadata>,
706 json_ld: Vec<String>,
707 structured_data_size: usize,
708 config: MetadataConfig,
709 lang: Option<String>,
710 dir: Option<String>,
711}
712
713#[allow(dead_code)]
714impl MetadataCollector {
715 /// Create a new metadata collector with configuration.
716 ///
717 /// Pre-allocates collections based on typical document sizes
718 /// for efficient append operations during traversal.
719 ///
720 /// # Arguments
721 ///
722 /// * `config` - Extraction configuration specifying which types to collect
723 ///
724 /// # Returns
725 ///
726 /// A new collector ready for use during tree traversal.
727 ///
728 /// # Examples
729 ///
730 /// ```ignore
731 /// let config = MetadataConfig::default();
732 /// let collector = MetadataCollector::new(config);
733 /// ```
734 pub(crate) fn new(config: MetadataConfig) -> Self {
735 Self {
736 head_metadata: BTreeMap::new(),
737 headers: Vec::with_capacity(32),
738 header_stack: Vec::with_capacity(6),
739 links: Vec::with_capacity(64),
740 images: Vec::with_capacity(16),
741 json_ld: Vec::with_capacity(4),
742 structured_data_size: 0,
743 config,
744 lang: None,
745 dir: None,
746 }
747 }
748
749 /// Add a header element to the collection.
750 ///
751 /// Validates that level is in range 1-6 and tracks hierarchy via depth.
752 ///
753 /// # Arguments
754 ///
755 /// * `level` - Header level (1-6)
756 /// * `text` - Normalized header text content
757 /// * `id` - Optional HTML id attribute
758 /// * `depth` - Current document nesting depth
759 /// * `html_offset` - Byte offset in original HTML
760 pub(crate) fn add_header(&mut self, level: u8, text: String, id: Option<String>, depth: usize, html_offset: usize) {
761 if !self.config.extract_headers {
762 return;
763 }
764
765 if !(1..=6).contains(&level) {
766 return;
767 }
768
769 let header = HeaderMetadata {
770 level,
771 text,
772 id,
773 depth,
774 html_offset,
775 };
776
777 self.headers.push(header);
778 }
779
780 /// Add a link element to the collection.
781 ///
782 /// Classifies the link based on href value and stores with metadata.
783 ///
784 /// # Arguments
785 ///
786 /// * `href` - The href attribute value
787 /// * `text` - Link text content
788 /// * `title` - Optional title attribute
789 /// * `rel` - Comma/space-separated rel attribute value
790 /// * `attributes` - Additional attributes to capture (e.g., data-* or aria-* values)
791 pub(crate) fn add_link(
792 &mut self,
793 href: String,
794 text: String,
795 title: Option<String>,
796 rel: Option<String>,
797 attributes: BTreeMap<String, String>,
798 ) {
799 if !self.config.extract_links {
800 return;
801 }
802
803 let link_type = LinkMetadata::classify_link(&href);
804
805 let rel_vec = rel
806 .map(|r| r.split_whitespace().map(|s| s.to_string()).collect::<Vec<_>>())
807 .unwrap_or_default();
808
809 let link = LinkMetadata {
810 href,
811 text,
812 title,
813 link_type,
814 rel: rel_vec,
815 attributes,
816 };
817
818 self.links.push(link);
819 }
820
821 /// Add an image element to the collection.
822 ///
823 /// # Arguments
824 ///
825 /// * `src` - Image source (URL or data URI)
826 /// * `alt` - Optional alt text
827 /// * `title` - Optional title attribute
828 /// * `dimensions` - Optional (width, height) tuple
829 pub(crate) fn add_image(
830 &mut self,
831 src: String,
832 alt: Option<String>,
833 title: Option<String>,
834 dimensions: Option<(u32, u32)>,
835 attributes: BTreeMap<String, String>,
836 ) {
837 if !self.config.extract_images {
838 return;
839 }
840
841 let image_type = if src.starts_with("data:") {
842 ImageType::DataUri
843 } else if src.starts_with("http://") || src.starts_with("https://") {
844 ImageType::External
845 } else if src.starts_with('<') && src.contains("svg") {
846 ImageType::InlineSvg
847 } else {
848 ImageType::Relative
849 };
850
851 let image = ImageMetadata {
852 src,
853 alt,
854 title,
855 dimensions,
856 image_type,
857 attributes,
858 };
859
860 self.images.push(image);
861 }
862
863 /// Add a JSON-LD structured data block.
864 ///
865 /// Accumulates JSON content with size validation against configured limits.
866 ///
867 /// # Arguments
868 ///
869 /// * `json_content` - Raw JSON string content
870 pub(crate) fn add_json_ld(&mut self, json_content: String) {
871 if !self.config.extract_structured_data {
872 return;
873 }
874
875 let content_size = json_content.len();
876 if self.structured_data_size + content_size > self.config.max_structured_data_size {
877 return;
878 }
879
880 self.structured_data_size += content_size;
881 self.json_ld.push(json_content);
882 }
883
884 /// Set document head metadata from extracted head section.
885 ///
886 /// Merges metadata pairs from head elements (meta, title, link, etc.)
887 /// into the collector's head metadata store.
888 ///
889 /// # Arguments
890 ///
891 /// * `metadata` - BTreeMap of metadata key-value pairs
892 pub(crate) fn set_head_metadata(&mut self, metadata: BTreeMap<String, String>) {
893 self.head_metadata.extend(metadata);
894 }
895
896 /// Set document language attribute.
897 ///
898 /// Usually from `lang` attribute on `<html>` or `<body>` tag.
899 /// Only sets if not already set (first occurrence wins).
900 ///
901 /// # Arguments
902 ///
903 /// * `lang` - Language code (e.g., "en", "es", "fr")
904 pub(crate) fn set_language(&mut self, lang: String) {
905 if self.lang.is_none() {
906 self.lang = Some(lang);
907 }
908 }
909
910 /// Set document text direction attribute.
911 ///
912 /// Usually from `dir` attribute on `<html>` or `<body>` tag.
913 /// Only sets if not already set (first occurrence wins).
914 ///
915 /// # Arguments
916 ///
917 /// * `dir` - Direction string ("ltr", "rtl", or "auto")
918 pub(crate) fn set_text_direction(&mut self, dir: String) {
919 if self.dir.is_none() {
920 self.dir = Some(dir);
921 }
922 }
923
924 /// Extract document metadata from collected head metadata.
925 ///
926 /// Parses head metadata into structured document metadata,
927 /// handling special cases like Open Graph, Twitter Card, keywords, etc.
928 #[allow(dead_code)]
929 fn extract_document_metadata(&self) -> DocumentMetadata {
930 let mut doc = DocumentMetadata::default();
931
932 for (raw_key, value) in &self.head_metadata {
933 let mut key = raw_key.to_lowercase();
934
935 if let Some(stripped) = key.strip_prefix("meta-") {
936 key = stripped.to_string();
937 }
938
939 if key.contains(':') {
940 key = key.replace(':', "-");
941 }
942
943 match key.as_str() {
944 "title" => doc.title = Some(value.clone()),
945 "description" => doc.description = Some(value.clone()),
946 "author" => doc.author = Some(value.clone()),
947 "canonical" => doc.canonical_url = Some(value.clone()),
948 "base" | "base-href" => doc.base_href = Some(value.clone()),
949 key if key.starts_with("og-") => {
950 let og_key = key.trim_start_matches("og-").replace('-', "_");
951 doc.open_graph.insert(og_key, value.clone());
952 }
953 key if key.starts_with("twitter-") => {
954 let tw_key = key.trim_start_matches("twitter-").replace('-', "_");
955 doc.twitter_card.insert(tw_key, value.clone());
956 }
957 "keywords" => {
958 doc.keywords = value
959 .split(',')
960 .map(|s| s.trim().to_string())
961 .filter(|s| !s.is_empty())
962 .collect();
963 }
964 _ => {
965 doc.meta_tags.insert(key.clone(), value.clone());
966 }
967 }
968 }
969
970 if let Some(ref lang) = self.lang {
971 doc.language = Some(lang.clone());
972 }
973
974 if let Some(ref dir) = self.dir {
975 if let Some(parsed_dir) = TextDirection::parse(dir) {
976 doc.text_direction = Some(parsed_dir);
977 }
978 }
979
980 doc
981 }
982
983 /// Extract structured data blocks into StructuredData items.
984 #[allow(dead_code)]
985 fn extract_structured_data(&self) -> Vec<StructuredData> {
986 let mut result = Vec::with_capacity(self.json_ld.len());
987
988 for json_str in &self.json_ld {
989 let schema_type = serde_json::from_str::<serde_json::Value>(json_str)
990 .ok()
991 .and_then(|v| v.get("@type").and_then(|t| t.as_str().map(|s| s.to_string())));
992
993 result.push(StructuredData {
994 data_type: StructuredDataType::JsonLd,
995 raw_json: json_str.clone(),
996 schema_type,
997 });
998 }
999
1000 result
1001 }
1002
1003 /// Finish collection and return all extracted metadata.
1004 ///
1005 /// Performs final processing, validation, and consolidation of all
1006 /// collected data into the [`ExtendedMetadata`] output structure.
1007 ///
1008 /// # Returns
1009 ///
1010 /// Complete [`ExtendedMetadata`] with all extracted information.
1011 #[allow(dead_code)]
1012 pub(crate) fn finish(self) -> ExtendedMetadata {
1013 let structured_data = self.extract_structured_data();
1014 let document = self.extract_document_metadata();
1015
1016 ExtendedMetadata {
1017 document,
1018 headers: self.headers,
1019 links: self.links,
1020 images: self.images,
1021 structured_data,
1022 }
1023 }
1024
1025 /// Categorize links by type for analysis and filtering.
1026 ///
1027 /// Separates collected links into groups by [`LinkType`].
1028 /// This is an analysis helper method; actual categorization happens during add_link.
1029 ///
1030 /// # Returns
1031 ///
1032 /// BTreeMap with LinkType as key and Vec of matching LinkMetadata as value.
1033 #[allow(dead_code)]
1034 pub(crate) fn categorize_links(&self) -> BTreeMap<String, Vec<&LinkMetadata>> {
1035 let mut categorized: BTreeMap<String, Vec<&LinkMetadata>> = BTreeMap::new();
1036
1037 for link in &self.links {
1038 let category = link.link_type.to_string();
1039 categorized.entry(category).or_default().push(link);
1040 }
1041
1042 categorized
1043 }
1044
1045 /// Count headers by level for structural analysis.
1046 ///
1047 /// Returns count of headers at each level (1-6).
1048 ///
1049 /// # Returns
1050 ///
1051 /// BTreeMap with level as string key and count as value.
1052 #[allow(dead_code)]
1053 pub(crate) fn header_counts(&self) -> BTreeMap<String, usize> {
1054 let mut counts: BTreeMap<String, usize> = BTreeMap::new();
1055
1056 for header in &self.headers {
1057 *counts.entry(header.level.to_string()).or_insert(0) += 1;
1058 }
1059
1060 counts
1061 }
1062}
1063
/// Handle to a metadata collector via reference-counted mutable cell.
///
/// This is an alias for `Rc<RefCell<MetadataCollector>>`: the `Rc` lets several
/// owners hold the same collector, and the `RefCell` grants mutable access
/// through `borrow_mut()` with the borrow rules enforced at runtime.
///
/// Used internally for sharing collector state across the tree traversal.
/// Matches the pattern used for [`InlineImageCollector`](crate::inline_images::InlineImageCollector).
///
/// # Examples
///
/// ```ignore
/// let collector = MetadataCollector::new(MetadataConfig::default());
/// let handle = Rc::new(RefCell::new(collector));
///
/// // In tree walk, can be passed and borrowed
/// handle.borrow_mut().add_header(1, "Title".to_string(), None, 0, 100);
///
/// // NOTE(review): `RefCell::take` needs `MetadataCollector: Default` — confirm.
/// let metadata = handle.take().finish();
/// ```
#[allow(dead_code)]
pub(crate) type MetadataCollectorHandle = Rc<RefCell<MetadataCollector>>;
1082
/// Unit tests for the metadata types and for [`MetadataCollector`].
#[cfg(test)]
mod tests {
    use super::*;

    /// `TextDirection::parse` accepts the three known values (case-insensitively)
    /// and rejects anything else.
    #[test]
    fn test_text_direction_parse() {
        assert_eq!(TextDirection::parse("ltr"), Some(TextDirection::LeftToRight));
        assert_eq!(TextDirection::parse("rtl"), Some(TextDirection::RightToLeft));
        assert_eq!(TextDirection::parse("auto"), Some(TextDirection::Auto));
        assert_eq!(TextDirection::parse("invalid"), None);
        assert_eq!(TextDirection::parse("LTR"), Some(TextDirection::LeftToRight));
    }

    /// `TextDirection` renders as the lowercase attribute value.
    #[test]
    fn test_text_direction_display() {
        assert_eq!(TextDirection::LeftToRight.to_string(), "ltr");
        assert_eq!(TextDirection::RightToLeft.to_string(), "rtl");
        assert_eq!(TextDirection::Auto.to_string(), "auto");
    }

    /// Each href shape maps to the expected `LinkType`.
    #[test]
    fn test_link_classification() {
        assert_eq!(LinkMetadata::classify_link("#section"), LinkType::Anchor);
        assert_eq!(LinkMetadata::classify_link("mailto:test@example.com"), LinkType::Email);
        assert_eq!(LinkMetadata::classify_link("tel:+1234567890"), LinkType::Phone);
        assert_eq!(LinkMetadata::classify_link("https://example.com"), LinkType::External);
        assert_eq!(LinkMetadata::classify_link("http://example.com"), LinkType::External);
        assert_eq!(LinkMetadata::classify_link("/path/to/page"), LinkType::Internal);
        assert_eq!(LinkMetadata::classify_link("../relative"), LinkType::Internal);
        assert_eq!(LinkMetadata::classify_link("./same"), LinkType::Internal);
    }

    /// Header levels 0 and 7 are invalid; a level in between is valid.
    #[test]
    fn test_header_validation() {
        let valid = HeaderMetadata {
            level: 3,
            text: "Title".to_string(),
            id: None,
            depth: 2,
            html_offset: 100,
        };
        assert!(valid.is_valid());

        let invalid_high = HeaderMetadata {
            level: 7,
            text: "Title".to_string(),
            id: None,
            depth: 2,
            html_offset: 100,
        };
        assert!(!invalid_high.is_valid());

        let invalid_low = HeaderMetadata {
            level: 0,
            text: "Title".to_string(),
            id: None,
            depth: 2,
            html_offset: 100,
        };
        assert!(!invalid_low.is_valid());
    }

    /// A new collector pre-allocates its buffers.
    #[test]
    fn test_metadata_collector_new() {
        let config = MetadataConfig::default();
        let collector = MetadataCollector::new(config);

        // `with_capacity` only guarantees *at least* the requested capacity, so
        // assert a lower bound instead of relying on the allocator returning the
        // exact size.
        assert!(collector.headers.capacity() >= 32);
        assert!(collector.links.capacity() >= 64);
        assert!(collector.images.capacity() >= 16);
        assert!(collector.json_ld.capacity() >= 4);
    }

    /// Valid headers are stored with their fields; an out-of-range level is dropped.
    #[test]
    fn test_metadata_collector_add_header() {
        let config = MetadataConfig::default();
        let mut collector = MetadataCollector::new(config);

        collector.add_header(1, "Title".to_string(), Some("title".to_string()), 0, 100);
        assert_eq!(collector.headers.len(), 1);

        let header = &collector.headers[0];
        assert_eq!(header.level, 1);
        assert_eq!(header.text, "Title");
        assert_eq!(header.id, Some("title".to_string()));

        // Level 7 must not be recorded: the count stays at one.
        collector.add_header(7, "Invalid".to_string(), None, 0, 200);
        assert_eq!(collector.headers.len(), 1);
    }

    /// `add_link` classifies the href, splits `rel` on whitespace, and keeps attributes.
    #[test]
    fn test_metadata_collector_add_link() {
        let config = MetadataConfig::default();
        let mut collector = MetadataCollector::new(config);

        collector.add_link(
            "https://example.com".to_string(),
            "Example".to_string(),
            Some("Visit".to_string()),
            Some("nofollow external".to_string()),
            BTreeMap::from([("data-id".to_string(), "example".to_string())]),
        );

        assert_eq!(collector.links.len(), 1);

        let link = &collector.links[0];
        assert_eq!(link.href, "https://example.com");
        assert_eq!(link.text, "Example");
        assert_eq!(link.link_type, LinkType::External);
        assert_eq!(link.rel, vec!["nofollow", "external"]);
        assert_eq!(link.attributes.get("data-id"), Some(&"example".to_string()));
    }

    /// With every extraction flag disabled, the `add_*` methods record nothing.
    #[test]
    fn test_metadata_collector_respects_config() {
        let config = MetadataConfig {
            extract_document: false,
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: DEFAULT_MAX_STRUCTURED_DATA_SIZE,
        };
        let mut collector = MetadataCollector::new(config);

        collector.add_header(1, "Title".to_string(), None, 0, 100);
        collector.add_link(
            "https://example.com".to_string(),
            "Link".to_string(),
            None,
            None,
            BTreeMap::new(),
        );
        collector.add_image(
            "https://example.com/img.jpg".to_string(),
            None,
            None,
            None,
            BTreeMap::new(),
        );
        collector.add_json_ld("{}".to_string());

        assert!(collector.headers.is_empty());
        assert!(collector.links.is_empty());
        assert!(collector.images.is_empty());
        assert!(collector.json_ld.is_empty());
    }

    /// `finish` carries the language, headers, and links into the result.
    #[test]
    fn test_metadata_collector_finish() {
        let config = MetadataConfig::default();
        let mut collector = MetadataCollector::new(config);

        collector.set_language("en".to_string());
        collector.add_header(1, "Main Title".to_string(), None, 0, 100);
        collector.add_link(
            "https://example.com".to_string(),
            "Example".to_string(),
            None,
            None,
            BTreeMap::new(),
        );

        let metadata = collector.finish();

        assert_eq!(metadata.document.language, Some("en".to_string()));
        assert_eq!(metadata.headers.len(), 1);
        assert_eq!(metadata.links.len(), 1);
    }

    /// A default `DocumentMetadata` has no scalar values and empty collections.
    #[test]
    fn test_document_metadata_default() {
        let doc = DocumentMetadata::default();

        assert!(doc.title.is_none());
        assert!(doc.description.is_none());
        assert!(doc.keywords.is_empty());
        assert!(doc.open_graph.is_empty());
        assert!(doc.twitter_card.is_empty());
        assert!(doc.meta_tags.is_empty());
    }

    /// The default configuration enables extraction and uses the default size limit.
    ///
    /// NOTE(review): `extract_document` is not asserted here — confirm its
    /// intended default and add a check.
    #[test]
    fn test_metadata_config_default() {
        let config = MetadataConfig::default();

        assert!(config.extract_headers);
        assert!(config.extract_links);
        assert!(config.extract_images);
        assert!(config.extract_structured_data);
        assert_eq!(config.max_structured_data_size, DEFAULT_MAX_STRUCTURED_DATA_SIZE);
    }

    /// `ImageMetadata` stores the `image_type` it was built with.
    ///
    /// NOTE(review): this only round-trips a hand-set field; it does not exercise
    /// any classification of `src`.
    #[test]
    fn test_image_type_classification() {
        let data_uri = ImageMetadata {
            src: "data:image/png;base64,iVBORw0KG...".to_string(),
            alt: None,
            title: None,
            dimensions: None,
            image_type: ImageType::DataUri,
            attributes: BTreeMap::new(),
        };
        assert_eq!(data_uri.image_type, ImageType::DataUri);

        let external = ImageMetadata {
            src: "https://example.com/image.jpg".to_string(),
            alt: None,
            title: None,
            dimensions: None,
            image_type: ImageType::External,
            attributes: BTreeMap::new(),
        };
        assert_eq!(external.image_type, ImageType::External);
    }

    /// `LinkType` renders as its lowercase name.
    #[test]
    fn test_link_type_display() {
        assert_eq!(LinkType::Anchor.to_string(), "anchor");
        assert_eq!(LinkType::Internal.to_string(), "internal");
        assert_eq!(LinkType::External.to_string(), "external");
        assert_eq!(LinkType::Email.to_string(), "email");
        assert_eq!(LinkType::Phone.to_string(), "phone");
        assert_eq!(LinkType::Other.to_string(), "other");
    }

    /// `StructuredDataType` renders as its snake_case name.
    #[test]
    fn test_structured_data_type_display() {
        assert_eq!(StructuredDataType::JsonLd.to_string(), "json_ld");
        assert_eq!(StructuredDataType::Microdata.to_string(), "microdata");
        assert_eq!(StructuredDataType::RDFa.to_string(), "rdfa");
    }

    /// `categorize_links` buckets links under the string form of their type.
    #[test]
    fn test_categorize_links() {
        let config = MetadataConfig::default();
        let mut collector = MetadataCollector::new(config);

        collector.add_link("#anchor".to_string(), "Anchor".to_string(), None, None, BTreeMap::new());
        collector.add_link(
            "https://example.com".to_string(),
            "External".to_string(),
            None,
            None,
            BTreeMap::new(),
        );
        collector.add_link(
            "mailto:test@example.com".to_string(),
            "Email".to_string(),
            None,
            None,
            BTreeMap::new(),
        );

        let categorized = collector.categorize_links();

        assert_eq!(categorized.get("anchor").map(|v| v.len()), Some(1));
        assert_eq!(categorized.get("external").map(|v| v.len()), Some(1));
        assert_eq!(categorized.get("email").map(|v| v.len()), Some(1));
    }

    /// `header_counts` tallies headers per level, keyed by the level as a string.
    #[test]
    fn test_header_counts() {
        let config = MetadataConfig::default();
        let mut collector = MetadataCollector::new(config);

        collector.add_header(1, "H1".to_string(), None, 0, 100);
        collector.add_header(2, "H2".to_string(), None, 1, 200);
        collector.add_header(2, "H2b".to_string(), None, 1, 300);
        collector.add_header(3, "H3".to_string(), None, 2, 400);

        let counts = collector.header_counts();

        assert_eq!(counts.get("1").copied(), Some(1));
        assert_eq!(counts.get("2").copied(), Some(2));
        assert_eq!(counts.get("3").copied(), Some(1));
    }
}