html_to_markdown_rs/
metadata.rs

1//! Metadata extraction for HTML to Markdown conversion.
2//!
3//! This module provides comprehensive, type-safe metadata extraction during HTML-to-Markdown
4//! conversion, enabling content analysis, SEO optimization, and document indexing workflows.
5//! Metadata includes:
6//! - **Document metadata**: Title, description, author, language, canonical URL, Open Graph, Twitter Card
7//! - **Headers**: Heading elements (h1-h6) with hierarchy, IDs, and positions
8//! - **Links**: Hyperlinks with type classification (anchor, internal, external, email, phone)
9//! - **Images**: Image elements with source, alt text, dimensions, and type (data URI, external, etc.)
10//! - **Structured data**: JSON-LD, Microdata, and RDFa blocks
11//!
12//! The implementation follows a single-pass collector pattern for zero-overhead extraction
13//! when metadata features are disabled.
14//!
15//! # Architecture
16//!
17//! Metadata extraction uses the [`MetadataCollector`] pattern (similar to [`InlineImageCollector`]):
18//! - **Single-pass collection**: Metadata is gathered during the primary tree traversal without additional passes
19//! - **Zero overhead when disabled**: Entire module can be compiled out via feature flags
20//! - **Configurable granularity**: Use [`MetadataConfig`] to select which metadata types to extract
21//! - **Type-safe APIs**: All metadata types are enum-based with exhaustive matching
22//! - **Memory-bounded**: Size limits prevent memory exhaustion from adversarial documents
23//! - **Pre-allocated buffers**: Typical documents (32 headers, 64 links, 16 images) handled efficiently
24//!
25//! # Type Overview
26//!
27//! ## Enumerations
28//!
29//! - [`TextDirection`]: Document directionality (LTR, RTL, Auto)
30//! - [`LinkType`]: Link classification (Anchor, Internal, External, Email, Phone, Other)
31//! - [`ImageType`]: Image source type (DataUri, External, Relative, InlineSvg)
32//! - [`StructuredDataType`]: Structured data format (JsonLd, Microdata, RDFa)
33//!
34//! ## Structures
35//!
36//! - [`DocumentMetadata`]: Head-level metadata with maps for Open Graph and Twitter Card
37//! - [`HeaderMetadata`]: Heading element with level (1-6), text, ID, hierarchy depth, and position
38//! - [`LinkMetadata`]: Hyperlink with href, text, title, type, rel attributes, and custom attributes
39//! - [`ImageMetadata`]: Image element with src, alt, title, dimensions, type, and attributes
40//! - [`StructuredData`]: Structured data block with type and raw JSON
41//! - [`MetadataConfig`]: Configuration controlling extraction granularity and size limits
42//! - [`ExtendedMetadata`]: Top-level result containing all extracted metadata
43//!
44//! # Examples
45//!
46//! ## Basic Usage with convert_with_metadata
47//!
48//! ```ignore
//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
//! use html_to_markdown_rs::metadata::{ImageType, LinkType};
50//!
51//! let html = r#"
52//!   <html lang="en">
53//!     <head>
54//!       <title>My Article</title>
55//!       <meta name="description" content="An interesting read">
56//!     </head>
57//!     <body>
58//!       <h1 id="main">Title</h1>
59//!       <a href="https://example.com">External Link</a>
60//!       <img src="photo.jpg" alt="A photo">
61//!     </body>
62//!   </html>
63//! "#;
64//!
65//! let config = MetadataConfig::default();
66//! let (markdown, metadata) = convert_with_metadata(html, None, config)?;
67//!
68//! // Access document metadata
69//! assert_eq!(metadata.document.title, Some("My Article".to_string()));
70//! assert_eq!(metadata.document.language, Some("en".to_string()));
71//!
72//! // Access headers
73//! assert_eq!(metadata.headers.len(), 1);
74//! assert_eq!(metadata.headers[0].level, 1);
75//! assert_eq!(metadata.headers[0].id, Some("main".to_string()));
76//!
77//! // Access links
78//! assert_eq!(metadata.links.len(), 1);
79//! assert_eq!(metadata.links[0].link_type, LinkType::External);
80//!
81//! // Access images
82//! assert_eq!(metadata.images.len(), 1);
83//! assert_eq!(metadata.images[0].image_type, ImageType::Relative);
84//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
85//! ```
86//!
87//! ## Selective Extraction
88//!
89//! ```ignore
90//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
91//!
//! let config = MetadataConfig {
//!     extract_document: true,
//!     extract_headers: true,
//!     extract_links: true,
//!     extract_images: false,  // Skip images
//!     extract_structured_data: false,  // Skip structured data
//!     max_structured_data_size: 0,
//! };
99//!
100//! let (markdown, metadata) = convert_with_metadata(html, None, config)?;
101//! assert_eq!(metadata.images.len(), 0);  // Images not extracted
102//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
103//! ```
104//!
105//! ## Analyzing Link Types
106//!
107//! ```ignore
108//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
109//! use html_to_markdown_rs::metadata::LinkType;
110//!
111//! let (_markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default())?;
112//!
113//! for link in &metadata.links {
114//!     match link.link_type {
115//!         LinkType::External => println!("External: {}", link.href),
116//!         LinkType::Internal => println!("Internal: {}", link.href),
117//!         LinkType::Anchor => println!("Anchor: {}", link.href),
118//!         LinkType::Email => println!("Email: {}", link.href),
119//!         _ => {}
120//!     }
121//! }
122//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
123//! ```
124//!
125//! # Serialization
126//!
127//! All types in this module support serialization via `serde` when the `metadata` feature is enabled.
128//! This enables easy export to JSON, YAML, or other formats:
129//!
130//! ```ignore
131//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
132//!
133//! let (_markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default())?;
134//! let json = serde_json::to_string_pretty(&metadata)?;
135//! println!("{}", json);
136//! # Ok::<(), Box<dyn std::error::Error>>(())
137//! ```
138
139use std::cell::RefCell;
140use std::collections::BTreeMap;
141use std::rc::Rc;
142
/// Text directionality of document content.
///
/// Corresponds to the HTML `dir` attribute and `bdi` element directionality.
///
/// When the `metadata` feature is enabled, each variant (de)serializes as its
/// lowercase HTML keyword: `ltr`, `rtl`, or `auto`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub enum TextDirection {
    /// Left-to-right text flow (default for Latin scripts); keyword `ltr`
    #[cfg_attr(feature = "metadata", serde(rename = "ltr"))]
    LeftToRight,
    /// Right-to-left text flow (Hebrew, Arabic, Urdu, etc.); keyword `rtl`
    #[cfg_attr(feature = "metadata", serde(rename = "rtl"))]
    RightToLeft,
    /// Automatic directionality detection; keyword `auto`
    #[cfg_attr(feature = "metadata", serde(rename = "auto"))]
    Auto,
}
159
160impl std::fmt::Display for TextDirection {
161    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
162        match self {
163            Self::LeftToRight => write!(f, "ltr"),
164            Self::RightToLeft => write!(f, "rtl"),
165            Self::Auto => write!(f, "auto"),
166        }
167    }
168}
169
170impl TextDirection {
171    /// Parse a text direction from string value.
172    ///
173    /// # Arguments
174    ///
175    /// * `s` - Direction string ("ltr", "rtl", or "auto")
176    ///
177    /// # Returns
178    ///
179    /// `Some(TextDirection)` if valid, `None` otherwise.
180    ///
181    /// # Examples
182    ///
183    /// ```
184    /// # use html_to_markdown_rs::metadata::TextDirection;
185    /// assert_eq!(TextDirection::parse("ltr"), Some(TextDirection::LeftToRight));
186    /// assert_eq!(TextDirection::parse("rtl"), Some(TextDirection::RightToLeft));
187    /// assert_eq!(TextDirection::parse("auto"), Some(TextDirection::Auto));
188    /// assert_eq!(TextDirection::parse("invalid"), None);
189    /// ```
190    pub fn parse(s: &str) -> Option<Self> {
191        if s.eq_ignore_ascii_case("ltr") {
192            return Some(Self::LeftToRight);
193        }
194        if s.eq_ignore_ascii_case("rtl") {
195            return Some(Self::RightToLeft);
196        }
197        if s.eq_ignore_ascii_case("auto") {
198            return Some(Self::Auto);
199        }
200        None
201    }
202}
203
/// Classification of a hyperlink, derived from its `href` value and document context.
///
/// Lets consumers filter and analyze extracted links by destination kind.
/// With the `metadata` feature enabled, variants (de)serialize in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(
    feature = "metadata",
    derive(serde::Serialize, serde::Deserialize),
    serde(rename_all = "snake_case")
)]
pub enum LinkType {
    /// Fragment link within the same document (`href` starts with `#`)
    Anchor,
    /// Link that stays within the same domain
    Internal,
    /// Link that points to a different domain
    External,
    /// Email link (`mailto:`)
    Email,
    /// Phone link (`tel:`)
    Phone,
    /// Any other protocol, or a link that cannot be classified
    Other,
}
224
225impl std::fmt::Display for LinkType {
226    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
227        match self {
228            Self::Anchor => write!(f, "anchor"),
229            Self::Internal => write!(f, "internal"),
230            Self::External => write!(f, "external"),
231            Self::Email => write!(f, "email"),
232            Self::Phone => write!(f, "phone"),
233            Self::Other => write!(f, "other"),
234        }
235    }
236}
237
/// Classification of an image by where its content comes from.
///
/// Distinguishes embedded images (data URIs), inline SVG markup, external URLs,
/// and relative paths so each can be handled appropriately.
/// With the `metadata` feature enabled, variants (de)serialize in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(
    feature = "metadata",
    derive(serde::Serialize, serde::Deserialize),
    serde(rename_all = "snake_case")
)]
pub enum ImageType {
    /// Image embedded as a data URI (base64 or other encoding)
    DataUri,
    /// Inline `<svg>` element
    InlineSvg,
    /// Image referenced by an external URL (http/https)
    External,
    /// Image referenced by a relative path
    Relative,
}
254
255impl std::fmt::Display for ImageType {
256    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
257        match self {
258            Self::DataUri => write!(f, "data_uri"),
259            Self::InlineSvg => write!(f, "inline_svg"),
260            Self::External => write!(f, "external"),
261            Self::Relative => write!(f, "relative"),
262        }
263    }
264}
265
/// Format of a structured-data block found in the document.
///
/// Identifies which schema markup convention the data was written in.
/// With the `metadata` feature enabled, variants (de)serialize as
/// `json_ld`, `microdata`, and `rdfa`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(
    feature = "metadata",
    derive(serde::Serialize, serde::Deserialize),
    serde(rename_all = "snake_case")
)]
pub enum StructuredDataType {
    /// JSON-LD (JSON for Linking Data) script blocks
    #[cfg_attr(feature = "metadata", serde(rename = "json_ld"))]
    JsonLd,
    /// HTML5 Microdata attributes (itemscope, itemtype, itemprop)
    Microdata,
    /// RDF in Attributes (RDFa) markup
    // Explicit rename: the automatic snake_case form of `RDFa` would not be `rdfa`.
    #[cfg_attr(feature = "metadata", serde(rename = "rdfa"))]
    RDFa,
}
282
283impl std::fmt::Display for StructuredDataType {
284    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
285        match self {
286            Self::JsonLd => write!(f, "json_ld"),
287            Self::Microdata => write!(f, "microdata"),
288            Self::RDFa => write!(f, "rdfa"),
289        }
290    }
291}
292
/// Document-level metadata extracted from `<head>` and top-level elements.
///
/// Contains all metadata typically used by search engines, social media platforms,
/// and browsers for document indexing and presentation.
///
/// The derived [`Default`] yields an empty record: every `Option` is `None` and
/// every collection is empty, which makes struct-update syntax convenient.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::DocumentMetadata;
/// let doc = DocumentMetadata {
///     title: Some("My Article".to_string()),
///     description: Some("A great article about Rust".to_string()),
///     keywords: vec!["rust".to_string(), "programming".to_string()],
///     ..Default::default()
/// };
///
/// assert_eq!(doc.title, Some("My Article".to_string()));
/// ```
#[derive(Debug, Clone, Default)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct DocumentMetadata {
    /// Document title from `<title>` tag
    pub title: Option<String>,

    /// Document description from `<meta name="description">` tag
    pub description: Option<String>,

    /// Document keywords from `<meta name="keywords">` tag, split on commas
    pub keywords: Vec<String>,

    /// Document author from `<meta name="author">` tag
    pub author: Option<String>,

    /// Canonical URL from `<link rel="canonical">` tag
    pub canonical_url: Option<String>,

    /// Base URL from `<base href="">` tag for resolving relative URLs
    pub base_href: Option<String>,

    /// Document language from `lang` attribute
    pub language: Option<String>,

    /// Document text direction from `dir` attribute
    pub text_direction: Option<TextDirection>,

    /// Open Graph metadata (og:* properties) for social media.
    /// Keys like "title", "description", "image", "url", etc.
    pub open_graph: BTreeMap<String, String>,

    /// Twitter Card metadata (twitter:* properties).
    /// Keys like "card", "site", "creator", "title", "description", "image", etc.
    pub twitter_card: BTreeMap<String, String>,

    /// Additional meta tags not covered by specific fields.
    /// Keys are meta name/property attributes, values are content.
    pub meta_tags: BTreeMap<String, String>,
}
350
/// Header element metadata with hierarchy tracking.
///
/// Captures heading elements (h1-h6) with their text content, identifiers,
/// and position in the document structure.
///
/// The fields are public, so the type itself does not enforce a valid `level`;
/// use [`HeaderMetadata::is_valid`] to check that it lies within 1-6.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::HeaderMetadata;
/// let header = HeaderMetadata {
///     level: 1,
///     text: "Main Title".to_string(),
///     id: Some("main-title".to_string()),
///     depth: 0,
///     html_offset: 145,
/// };
///
/// assert_eq!(header.level, 1);
/// assert!(header.is_valid());
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct HeaderMetadata {
    /// Header level: 1 (h1) through 6 (h6)
    pub level: u8,

    /// Normalized text content of the header
    pub text: String,

    /// HTML id attribute if present
    pub id: Option<String>,

    /// Document tree depth at the header element
    pub depth: usize,

    /// Byte offset in original HTML document
    pub html_offset: usize,
}
389
390impl HeaderMetadata {
391    /// Validate that the header level is within valid range (1-6).
392    ///
393    /// # Returns
394    ///
395    /// `true` if level is 1-6, `false` otherwise.
396    ///
397    /// # Examples
398    ///
399    /// ```
400    /// # use html_to_markdown_rs::metadata::HeaderMetadata;
401    /// let valid = HeaderMetadata {
402    ///     level: 3,
403    ///     text: "Title".to_string(),
404    ///     id: None,
405    ///     depth: 2,
406    ///     html_offset: 100,
407    /// };
408    /// assert!(valid.is_valid());
409    ///
410    /// let invalid = HeaderMetadata {
411    ///     level: 7,  // Invalid
412    ///     text: "Title".to_string(),
413    ///     id: None,
414    ///     depth: 2,
415    ///     html_offset: 100,
416    /// };
417    /// assert!(!invalid.is_valid());
418    /// ```
419    pub fn is_valid(&self) -> bool {
420        self.level >= 1 && self.level <= 6
421    }
422}
423
/// Hyperlink metadata with categorization and attributes.
///
/// Represents `<a>` elements with parsed href values, text content, and link type classification.
/// The classification can be computed from an `href` with [`LinkMetadata::classify_link`].
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::{LinkMetadata, LinkType};
/// let link = LinkMetadata {
///     href: "https://example.com".to_string(),
///     text: "Example".to_string(),
///     title: Some("Visit Example".to_string()),
///     link_type: LinkType::External,
///     rel: vec!["nofollow".to_string()],
///     attributes: Default::default(),
/// };
///
/// assert_eq!(link.link_type, LinkType::External);
/// assert_eq!(link.text, "Example");
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct LinkMetadata {
    /// The href URL value
    pub href: String,

    /// Link text content (normalized, concatenated if mixed with elements)
    pub text: String,

    /// Optional title attribute (often shown as tooltip)
    pub title: Option<String>,

    /// Link type classification
    pub link_type: LinkType,

    /// Rel attribute values, one entry per token (e.g., "nofollow", "stylesheet", "canonical")
    pub rel: Vec<String>,

    /// Additional HTML attributes, keyed by attribute name
    pub attributes: BTreeMap<String, String>,
}
465
466impl LinkMetadata {
467    /// Classify a link based on href value.
468    ///
469    /// # Arguments
470    ///
471    /// * `href` - The href attribute value
472    ///
473    /// # Returns
474    ///
475    /// Appropriate [`LinkType`] based on protocol and content.
476    ///
477    /// # Examples
478    ///
479    /// ```
480    /// # use html_to_markdown_rs::metadata::{LinkMetadata, LinkType};
481    /// assert_eq!(LinkMetadata::classify_link("#section"), LinkType::Anchor);
482    /// assert_eq!(LinkMetadata::classify_link("mailto:test@example.com"), LinkType::Email);
483    /// assert_eq!(LinkMetadata::classify_link("tel:+1234567890"), LinkType::Phone);
484    /// assert_eq!(LinkMetadata::classify_link("https://example.com"), LinkType::External);
485    /// ```
486    pub fn classify_link(href: &str) -> LinkType {
487        if href.starts_with('#') {
488            LinkType::Anchor
489        } else if href.starts_with("mailto:") {
490            LinkType::Email
491        } else if href.starts_with("tel:") {
492            LinkType::Phone
493        } else if href.starts_with("http://") || href.starts_with("https://") {
494            LinkType::External
495        } else if href.starts_with('/') || href.starts_with("../") || href.starts_with("./") {
496            LinkType::Internal
497        } else {
498            LinkType::Other
499        }
500    }
501}
502
/// Image metadata with source and dimensions.
///
/// Captures `<img>` elements and inline `<svg>` elements with metadata
/// for image analysis and optimization.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::{ImageMetadata, ImageType};
/// let img = ImageMetadata {
///     src: "https://example.com/image.jpg".to_string(),
///     alt: Some("An example image".to_string()),
///     title: Some("Example".to_string()),
///     dimensions: Some((800, 600)),
///     image_type: ImageType::External,
///     attributes: Default::default(),
/// };
///
/// assert_eq!(img.image_type, ImageType::External);
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct ImageMetadata {
    /// Image source (URL, data URI, or SVG content identifier)
    pub src: String,

    /// Alternative text from alt attribute (for accessibility)
    pub alt: Option<String>,

    /// Title attribute (often shown as tooltip)
    pub title: Option<String>,

    /// Image dimensions as `(width, height)`, in that order, if available
    pub dimensions: Option<(u32, u32)>,

    /// Image type classification (see [`ImageType`])
    pub image_type: ImageType,

    /// Additional HTML attributes, keyed by attribute name
    pub attributes: BTreeMap<String, String>,
}
544
/// Structured data block (JSON-LD, Microdata, or RDFa).
///
/// Represents machine-readable structured data found in the document.
/// JSON-LD blocks are collected as raw JSON strings for flexibility; this type
/// stores the text as-is and does not parse or validate it.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::{StructuredData, StructuredDataType};
/// let schema = StructuredData {
///     data_type: StructuredDataType::JsonLd,
///     raw_json: r#"{"@context":"https://schema.org","@type":"Article"}"#.to_string(),
///     schema_type: Some("Article".to_string()),
/// };
///
/// assert_eq!(schema.data_type, StructuredDataType::JsonLd);
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct StructuredData {
    /// Type of structured data (JSON-LD, Microdata, RDFa)
    pub data_type: StructuredDataType,

    /// Raw JSON string (for JSON-LD) or serialized representation
    pub raw_json: String,

    /// Schema type if detectable (e.g., "Article", "Event", "Product")
    pub schema_type: Option<String>,
}
574
/// Default maximum size for structured data extraction: 1,000,000 bytes (1 MB, decimal).
///
/// Used by [`MetadataConfig::default`] as the initial `max_structured_data_size`.
pub const DEFAULT_MAX_STRUCTURED_DATA_SIZE: usize = 1_000_000;
577
/// Configuration for metadata extraction granularity.
///
/// Controls which metadata types are extracted and size limits for safety.
/// [`MetadataConfig::default`] enables every category and sets the structured
/// data limit to [`DEFAULT_MAX_STRUCTURED_DATA_SIZE`].
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::MetadataConfig;
/// let config = MetadataConfig {
///     extract_document: true,
///     extract_headers: true,
///     extract_links: true,
///     extract_images: true,
///     extract_structured_data: true,
///     max_structured_data_size: 1_000_000,
/// };
///
/// assert!(config.extract_headers);
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct MetadataConfig {
    /// Extract document-level metadata (title, description, author, etc.)
    pub extract_document: bool,

    /// Extract h1-h6 header elements and their hierarchy
    pub extract_headers: bool,

    /// Extract anchor (a) elements as links with type classification
    pub extract_links: bool,

    /// Extract image elements and data URIs
    pub extract_images: bool,

    /// Extract structured data (JSON-LD, Microdata, RDFa)
    pub extract_structured_data: bool,

    /// Maximum total size of structured data to collect (bytes).
    /// Prevents memory exhaustion on malformed or adversarial documents.
    pub max_structured_data_size: usize,
}
619
/// Partial update for [`MetadataConfig`].
///
/// Every field is optional: `None` means "leave the current value alone" when the
/// update is applied with [`MetadataConfig::apply_update`], and `Some(value)`
/// overrides it. The derived [`Default`] is an update that changes nothing.
///
/// When the `serde` or `metadata` feature is enabled the type is deserializable;
/// keys are expected in `camelCase`, and each field also accepts its `snake_case`
/// name as an alias.
#[derive(Debug, Clone, Default)]
#[cfg_attr(any(feature = "serde", feature = "metadata"), derive(serde::Deserialize))]
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
pub struct MetadataConfigUpdate {
    /// Override for [`MetadataConfig::extract_document`]
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_document"))]
    pub extract_document: Option<bool>,
    /// Override for [`MetadataConfig::extract_headers`]
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_headers"))]
    pub extract_headers: Option<bool>,
    /// Override for [`MetadataConfig::extract_links`]
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_links"))]
    pub extract_links: Option<bool>,
    /// Override for [`MetadataConfig::extract_images`]
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_images"))]
    pub extract_images: Option<bool>,
    /// Override for [`MetadataConfig::extract_structured_data`]
    #[cfg_attr(
        any(feature = "serde", feature = "metadata"),
        serde(alias = "extract_structured_data")
    )]
    pub extract_structured_data: Option<bool>,
    /// Override for [`MetadataConfig::max_structured_data_size`] (bytes)
    #[cfg_attr(
        any(feature = "serde", feature = "metadata"),
        serde(alias = "max_structured_data_size")
    )]
    pub max_structured_data_size: Option<usize>,
}
644
645impl Default for MetadataConfig {
646    /// Create default metadata configuration.
647    ///
648    /// Defaults to extracting all metadata types with 1MB limit on structured data.
649    fn default() -> Self {
650        Self {
651            extract_document: true,
652            extract_headers: true,
653            extract_links: true,
654            extract_images: true,
655            extract_structured_data: true,
656            max_structured_data_size: DEFAULT_MAX_STRUCTURED_DATA_SIZE,
657        }
658    }
659}
660
661impl MetadataConfig {
662    pub fn any_enabled(&self) -> bool {
663        self.extract_document
664            || self.extract_headers
665            || self.extract_links
666            || self.extract_images
667            || self.extract_structured_data
668    }
669
670    pub fn apply_update(&mut self, update: MetadataConfigUpdate) {
671        if let Some(extract_document) = update.extract_document {
672            self.extract_document = extract_document;
673        }
674        if let Some(extract_headers) = update.extract_headers {
675            self.extract_headers = extract_headers;
676        }
677        if let Some(extract_links) = update.extract_links {
678            self.extract_links = extract_links;
679        }
680        if let Some(extract_images) = update.extract_images {
681            self.extract_images = extract_images;
682        }
683        if let Some(extract_structured_data) = update.extract_structured_data {
684            self.extract_structured_data = extract_structured_data;
685        }
686        if let Some(max_structured_data_size) = update.max_structured_data_size {
687            self.max_structured_data_size = max_structured_data_size;
688        }
689    }
690
691    pub fn from_update(update: MetadataConfigUpdate) -> Self {
692        let mut config = Self::default();
693        config.apply_update(update);
694        config
695    }
696}
697
698impl From<MetadataConfigUpdate> for MetadataConfig {
699    fn from(update: MetadataConfigUpdate) -> Self {
700        Self::from_update(update)
701    }
702}
703
/// Comprehensive metadata extraction result from HTML document.
///
/// Contains all extracted metadata types in a single structure,
/// suitable for serialization and transmission across language boundaries.
/// The derived [`Default`] is an empty result: default document metadata and
/// empty lists.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::ExtendedMetadata;
/// let metadata = ExtendedMetadata {
///     document: Default::default(),
///     headers: Vec::new(),
///     links: Vec::new(),
///     images: Vec::new(),
///     structured_data: Vec::new(),
/// };
///
/// assert!(metadata.headers.is_empty());
/// ```
#[derive(Debug, Clone, Default)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct ExtendedMetadata {
    /// Document-level metadata (title, description, canonical, etc.)
    pub document: DocumentMetadata,

    /// Extracted header elements with hierarchy
    pub headers: Vec<HeaderMetadata>,

    /// Extracted hyperlinks with type classification
    pub links: Vec<LinkMetadata>,

    /// Extracted images with source and dimensions
    pub images: Vec<ImageMetadata>,

    /// Extracted structured data blocks
    pub structured_data: Vec<StructuredData>,
}
741
/// Internal metadata collector for single-pass extraction.
///
/// Follows the [`InlineImageCollector`](crate::inline_images::InlineImageCollector) pattern
/// for efficient metadata extraction during tree traversal. Maintains state for:
/// - Document metadata from head elements
/// - Header hierarchy tracking
/// - Link and image accumulation
/// - Structured data collection
/// - Language and directionality attributes
///
/// # Architecture
///
/// The collector is designed to be:
/// - **Performant**: Pre-allocated collections, minimal cloning
/// - **Single-pass**: Collects during main tree walk without separate passes
/// - **Optional**: Zero overhead when disabled via feature flags
/// - **Type-safe**: Strict separation of collection and result types
///
/// # Internal State
///
/// - `head_metadata`: Raw metadata pairs from head element
/// - `headers`: Collected header elements
/// - `header_stack`: For tracking nesting depth
/// - `links`: Collected link elements
/// - `images`: Collected image elements
/// - `json_ld`: JSON-LD script block contents
/// - `structured_data_size`: Running size counter, starts at 0
///   (NOTE(review): presumably compared against `config.max_structured_data_size`;
///   the code that updates it is outside this excerpt — confirm)
/// - `config`: The [`MetadataConfig`] deciding which categories are collected
/// - `lang`: Document language
/// - `dir`: Document text direction (stored as the raw string)
#[derive(Debug)]
// NOTE(review): presumably silences unused-field warnings in builds where the
// collector is not fully wired up — confirm the reason before removing.
#[allow(dead_code)]
pub(crate) struct MetadataCollector {
    head_metadata: BTreeMap<String, String>,
    headers: Vec<HeaderMetadata>,
    header_stack: Vec<usize>,
    links: Vec<LinkMetadata>,
    images: Vec<ImageMetadata>,
    json_ld: Vec<String>,
    structured_data_size: usize,
    config: MetadataConfig,
    lang: Option<String>,
    dir: Option<String>,
}
784
785#[allow(dead_code)]
786impl MetadataCollector {
787    /// Create a new metadata collector with configuration.
788    ///
789    /// Pre-allocates collections based on typical document sizes
790    /// for efficient append operations during traversal.
791    ///
792    /// # Arguments
793    ///
794    /// * `config` - Extraction configuration specifying which types to collect
795    ///
796    /// # Returns
797    ///
798    /// A new collector ready for use during tree traversal.
799    ///
800    /// # Examples
801    ///
802    /// ```ignore
803    /// let config = MetadataConfig::default();
804    /// let collector = MetadataCollector::new(config);
805    /// ```
806    pub(crate) fn new(config: MetadataConfig) -> Self {
807        Self {
808            head_metadata: BTreeMap::new(),
809            headers: Vec::with_capacity(32),
810            header_stack: Vec::with_capacity(6),
811            links: Vec::with_capacity(64),
812            images: Vec::with_capacity(16),
813            json_ld: Vec::with_capacity(4),
814            structured_data_size: 0,
815            config,
816            lang: None,
817            dir: None,
818        }
819    }
820
821    /// Add a header element to the collection.
822    ///
823    /// Validates that level is in range 1-6 and tracks hierarchy via depth.
824    ///
825    /// # Arguments
826    ///
827    /// * `level` - Header level (1-6)
828    /// * `text` - Normalized header text content
829    /// * `id` - Optional HTML id attribute
830    /// * `depth` - Current document nesting depth
831    /// * `html_offset` - Byte offset in original HTML
832    pub(crate) fn add_header(&mut self, level: u8, text: String, id: Option<String>, depth: usize, html_offset: usize) {
833        if !self.config.extract_headers {
834            return;
835        }
836
837        if !(1..=6).contains(&level) {
838            return;
839        }
840
841        let header = HeaderMetadata {
842            level,
843            text,
844            id,
845            depth,
846            html_offset,
847        };
848
849        self.headers.push(header);
850    }
851
852    /// Add a link element to the collection.
853    ///
854    /// Classifies the link based on href value and stores with metadata.
855    ///
856    /// # Arguments
857    ///
858    /// * `href` - The href attribute value
859    /// * `text` - Link text content
860    /// * `title` - Optional title attribute
861    /// * `rel` - Comma/space-separated rel attribute value
862    /// * `attributes` - Additional attributes to capture (e.g., data-* or aria-* values)
863    pub(crate) fn add_link(
864        &mut self,
865        href: String,
866        text: String,
867        title: Option<String>,
868        rel: Option<String>,
869        attributes: BTreeMap<String, String>,
870    ) {
871        if !self.config.extract_links {
872            return;
873        }
874
875        let link_type = LinkMetadata::classify_link(&href);
876
877        let rel_vec = rel
878            .map(|r| r.split_whitespace().map(|s| s.to_string()).collect::<Vec<_>>())
879            .unwrap_or_default();
880
881        let link = LinkMetadata {
882            href,
883            text,
884            title,
885            link_type,
886            rel: rel_vec,
887            attributes,
888        };
889
890        self.links.push(link);
891    }
892
893    /// Add an image element to the collection.
894    ///
895    /// # Arguments
896    ///
897    /// * `src` - Image source (URL or data URI)
898    /// * `alt` - Optional alt text
899    /// * `title` - Optional title attribute
900    /// * `dimensions` - Optional (width, height) tuple
901    pub(crate) fn add_image(
902        &mut self,
903        src: String,
904        alt: Option<String>,
905        title: Option<String>,
906        dimensions: Option<(u32, u32)>,
907        attributes: BTreeMap<String, String>,
908    ) {
909        if !self.config.extract_images {
910            return;
911        }
912
913        let image_type = if src.starts_with("data:") {
914            ImageType::DataUri
915        } else if src.starts_with("http://") || src.starts_with("https://") {
916            ImageType::External
917        } else if src.starts_with('<') && src.contains("svg") {
918            ImageType::InlineSvg
919        } else {
920            ImageType::Relative
921        };
922
923        let image = ImageMetadata {
924            src,
925            alt,
926            title,
927            dimensions,
928            image_type,
929            attributes,
930        };
931
932        self.images.push(image);
933    }
934
935    /// Add a JSON-LD structured data block.
936    ///
937    /// Accumulates JSON content with size validation against configured limits.
938    ///
939    /// # Arguments
940    ///
941    /// * `json_content` - Raw JSON string content
942    pub(crate) fn add_json_ld(&mut self, json_content: String) {
943        if !self.config.extract_structured_data {
944            return;
945        }
946
947        let content_size = json_content.len();
948        if content_size > self.config.max_structured_data_size {
949            return;
950        }
951        if self.structured_data_size + content_size > self.config.max_structured_data_size {
952            return;
953        }
954
955        self.structured_data_size += content_size;
956        self.json_ld.push(json_content);
957    }
958
959    /// Set document head metadata from extracted head section.
960    ///
961    /// Merges metadata pairs from head elements (meta, title, link, etc.)
962    /// into the collector's head metadata store.
963    ///
964    /// # Arguments
965    ///
966    /// * `metadata` - BTreeMap of metadata key-value pairs
967    pub(crate) fn set_head_metadata(&mut self, metadata: BTreeMap<String, String>) {
968        if !self.config.extract_document {
969            return;
970        }
971        self.head_metadata.extend(metadata);
972    }
973
974    /// Set document language attribute.
975    ///
976    /// Usually from `lang` attribute on `<html>` or `<body>` tag.
977    /// Only sets if not already set (first occurrence wins).
978    ///
979    /// # Arguments
980    ///
981    /// * `lang` - Language code (e.g., "en", "es", "fr")
982    pub(crate) fn set_language(&mut self, lang: String) {
983        if !self.config.extract_document {
984            return;
985        }
986        if self.lang.is_none() {
987            self.lang = Some(lang);
988        }
989    }
990
991    /// Set document text direction attribute.
992    ///
993    /// Usually from `dir` attribute on `<html>` or `<body>` tag.
994    /// Only sets if not already set (first occurrence wins).
995    ///
996    /// # Arguments
997    ///
998    /// * `dir` - Direction string ("ltr", "rtl", or "auto")
999    pub(crate) fn set_text_direction(&mut self, dir: String) {
1000        if !self.config.extract_document {
1001            return;
1002        }
1003        if self.dir.is_none() {
1004            self.dir = Some(dir);
1005        }
1006    }
1007
1008    pub(crate) fn wants_document(&self) -> bool {
1009        self.config.extract_document
1010    }
1011
1012    pub(crate) fn wants_headers(&self) -> bool {
1013        self.config.extract_headers
1014    }
1015
1016    pub(crate) fn wants_links(&self) -> bool {
1017        self.config.extract_links
1018    }
1019
1020    pub(crate) fn wants_images(&self) -> bool {
1021        self.config.extract_images
1022    }
1023
1024    pub(crate) fn wants_structured_data(&self) -> bool {
1025        self.config.extract_structured_data
1026    }
1027
1028    /// Extract document metadata from collected head metadata.
1029    ///
1030    /// Parses head metadata into structured document metadata,
1031    /// handling special cases like Open Graph, Twitter Card, keywords, etc.
1032    #[allow(dead_code)]
1033    fn extract_document_metadata(
1034        head_metadata: BTreeMap<String, String>,
1035        lang: Option<String>,
1036        dir: Option<String>,
1037    ) -> DocumentMetadata {
1038        let mut doc = DocumentMetadata::default();
1039
1040        for (raw_key, value) in head_metadata {
1041            let mut key = raw_key.as_str();
1042            let mut replaced_key: Option<String> = None;
1043
1044            if let Some(stripped) = key.strip_prefix("meta-") {
1045                key = stripped;
1046            }
1047
1048            if key.as_bytes().contains(&b':') {
1049                replaced_key = Some(key.replace(':', "-"));
1050                key = replaced_key.as_deref().unwrap_or(key);
1051            }
1052
1053            match key {
1054                "title" => doc.title = Some(value),
1055                "description" => doc.description = Some(value),
1056                "author" => doc.author = Some(value),
1057                "canonical" => doc.canonical_url = Some(value),
1058                "base" | "base-href" => doc.base_href = Some(value),
1059                key if key.starts_with("og-") => {
1060                    let og_key = if key.as_bytes().contains(&b'-') {
1061                        key.trim_start_matches("og-").replace('-', "_")
1062                    } else {
1063                        key.trim_start_matches("og-").to_string()
1064                    };
1065                    doc.open_graph.insert(og_key, value);
1066                }
1067                key if key.starts_with("twitter-") => {
1068                    let tw_key = if key.as_bytes().contains(&b'-') {
1069                        key.trim_start_matches("twitter-").replace('-', "_")
1070                    } else {
1071                        key.trim_start_matches("twitter-").to_string()
1072                    };
1073                    doc.twitter_card.insert(tw_key, value);
1074                }
1075                "keywords" => {
1076                    doc.keywords = value
1077                        .split(',')
1078                        .map(|s| s.trim().to_string())
1079                        .filter(|s| !s.is_empty())
1080                        .collect();
1081                }
1082                _ => {
1083                    let meta_key = if key.as_ptr() == raw_key.as_ptr() && key.len() == raw_key.len() {
1084                        raw_key
1085                    } else if let Some(replaced) = replaced_key {
1086                        replaced
1087                    } else {
1088                        key.to_string()
1089                    };
1090                    doc.meta_tags.insert(meta_key, value);
1091                }
1092            }
1093        }
1094
1095        if let Some(lang) = lang {
1096            doc.language = Some(lang);
1097        }
1098
1099        if let Some(dir) = dir {
1100            if let Some(parsed_dir) = TextDirection::parse(&dir) {
1101                doc.text_direction = Some(parsed_dir);
1102            }
1103        }
1104
1105        doc
1106    }
1107
1108    /// Extract structured data blocks into StructuredData items.
1109    #[allow(dead_code)]
1110    fn extract_structured_data(json_ld: Vec<String>) -> Vec<StructuredData> {
1111        let mut result = Vec::with_capacity(json_ld.len());
1112
1113        for json_str in json_ld {
1114            let schema_type = Self::scan_schema_type(&json_str)
1115                .or_else(|| {
1116                    if json_str.contains("\"@type\"") {
1117                        serde_json::from_str::<serde_json::Value>(&json_str)
1118                            .ok()
1119                            .and_then(|v| v.get("@type").and_then(|t| t.as_str().map(|s| s.to_string())))
1120                    } else {
1121                        None
1122                    }
1123                })
1124                .or_else(|| {
1125                    if !json_str.contains("\"@graph\"") {
1126                        return None;
1127                    }
1128
1129                    let value = serde_json::from_str::<serde_json::Value>(&json_str).ok()?;
1130                    let graph = value.get("@graph")?;
1131                    let items = graph.as_array()?;
1132                    items
1133                        .iter()
1134                        .find_map(|item| item.get("@type").and_then(|t| t.as_str().map(|s| s.to_string())))
1135                });
1136
1137            result.push(StructuredData {
1138                data_type: StructuredDataType::JsonLd,
1139                raw_json: json_str,
1140                schema_type,
1141            });
1142        }
1143
1144        result
1145    }
1146
1147    fn scan_schema_type(json_str: &str) -> Option<String> {
1148        let needle = "\"@type\"";
1149        let start = json_str.find(needle)? + needle.len();
1150        let bytes = json_str.as_bytes();
1151        let mut i = start;
1152
1153        while i < bytes.len() && bytes[i].is_ascii_whitespace() {
1154            i += 1;
1155        }
1156        if i >= bytes.len() || bytes[i] != b':' {
1157            return None;
1158        }
1159        i += 1;
1160        while i < bytes.len() && bytes[i].is_ascii_whitespace() {
1161            i += 1;
1162        }
1163        if i >= bytes.len() {
1164            return None;
1165        }
1166
1167        if bytes[i] == b'[' {
1168            i += 1;
1169            while i < bytes.len() && bytes[i].is_ascii_whitespace() {
1170                i += 1;
1171            }
1172            if i >= bytes.len() || bytes[i] != b'"' {
1173                return None;
1174            }
1175        } else if bytes[i] != b'"' {
1176            return None;
1177        }
1178
1179        let start_quote = i;
1180        i += 1;
1181        let mut escaped = false;
1182        while i < bytes.len() {
1183            let byte = bytes[i];
1184            if escaped {
1185                escaped = false;
1186                i += 1;
1187                continue;
1188            }
1189            if byte == b'\\' {
1190                escaped = true;
1191                i += 1;
1192                continue;
1193            }
1194            if byte == b'"' {
1195                let end_quote = i;
1196                let slice = &json_str[start_quote..=end_quote];
1197                return serde_json::from_str::<String>(slice).ok();
1198            }
1199            i += 1;
1200        }
1201
1202        None
1203    }
1204
1205    /// Finish collection and return all extracted metadata.
1206    ///
1207    /// Performs final processing, validation, and consolidation of all
1208    /// collected data into the [`ExtendedMetadata`] output structure.
1209    ///
1210    /// # Returns
1211    ///
1212    /// Complete [`ExtendedMetadata`] with all extracted information.
1213    #[allow(dead_code)]
1214    pub(crate) fn finish(self) -> ExtendedMetadata {
1215        let structured_data = Self::extract_structured_data(self.json_ld);
1216        let document = Self::extract_document_metadata(self.head_metadata, self.lang, self.dir);
1217
1218        ExtendedMetadata {
1219            document,
1220            headers: self.headers,
1221            links: self.links,
1222            images: self.images,
1223            structured_data,
1224        }
1225    }
1226
1227    /// Categorize links by type for analysis and filtering.
1228    ///
1229    /// Separates collected links into groups by [`LinkType`].
1230    /// This is an analysis helper method; actual categorization happens during add_link.
1231    ///
1232    /// # Returns
1233    ///
1234    /// BTreeMap with LinkType as key and Vec of matching LinkMetadata as value.
1235    #[allow(dead_code)]
1236    pub(crate) fn categorize_links(&self) -> BTreeMap<String, Vec<&LinkMetadata>> {
1237        let mut categorized: BTreeMap<String, Vec<&LinkMetadata>> = BTreeMap::new();
1238
1239        for link in &self.links {
1240            let category = link.link_type.to_string();
1241            categorized.entry(category).or_default().push(link);
1242        }
1243
1244        categorized
1245    }
1246
1247    /// Count headers by level for structural analysis.
1248    ///
1249    /// Returns count of headers at each level (1-6).
1250    ///
1251    /// # Returns
1252    ///
1253    /// BTreeMap with level as string key and count as value.
1254    #[allow(dead_code)]
1255    pub(crate) fn header_counts(&self) -> BTreeMap<String, usize> {
1256        let mut counts: BTreeMap<String, usize> = BTreeMap::new();
1257
1258        for header in &self.headers {
1259            *counts.entry(header.level.to_string()).or_insert(0) += 1;
1260        }
1261
1262        counts
1263    }
1264}
1265
1266/// Handle to a metadata collector via reference-counted mutable cell.
1267///
1268/// Used internally for sharing collector state across the tree traversal.
1269/// Matches the pattern used for [`InlineImageCollector`](crate::inline_images::InlineImageCollector).
1270///
1271/// # Examples
1272///
1273/// ```ignore
1274/// let collector = MetadataCollector::new(MetadataConfig::default());
1275/// let handle = Rc::new(RefCell::new(collector));
1276///
1277/// // In tree walk, can be passed and borrowed
1278/// handle.borrow_mut().add_header(1, "Title".to_string(), None, 0, 100);
1279///
1280/// let metadata = handle.take().finish();
1281/// ```
1282#[allow(dead_code)]
1283pub(crate) type MetadataCollectorHandle = Rc<RefCell<MetadataCollector>>;
1284
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_text_direction_parse() {
        assert_eq!(TextDirection::parse("ltr"), Some(TextDirection::LeftToRight));
        assert_eq!(TextDirection::parse("rtl"), Some(TextDirection::RightToLeft));
        assert_eq!(TextDirection::parse("auto"), Some(TextDirection::Auto));
        assert_eq!(TextDirection::parse("invalid"), None);
        assert_eq!(TextDirection::parse("LTR"), Some(TextDirection::LeftToRight));
    }

    #[test]
    fn test_text_direction_display() {
        assert_eq!(TextDirection::LeftToRight.to_string(), "ltr");
        assert_eq!(TextDirection::RightToLeft.to_string(), "rtl");
        assert_eq!(TextDirection::Auto.to_string(), "auto");
    }

    #[test]
    fn test_link_classification() {
        assert_eq!(LinkMetadata::classify_link("#section"), LinkType::Anchor);
        assert_eq!(LinkMetadata::classify_link("mailto:test@example.com"), LinkType::Email);
        assert_eq!(LinkMetadata::classify_link("tel:+1234567890"), LinkType::Phone);
        assert_eq!(LinkMetadata::classify_link("https://example.com"), LinkType::External);
        assert_eq!(LinkMetadata::classify_link("http://example.com"), LinkType::External);
        assert_eq!(LinkMetadata::classify_link("/path/to/page"), LinkType::Internal);
        assert_eq!(LinkMetadata::classify_link("../relative"), LinkType::Internal);
        assert_eq!(LinkMetadata::classify_link("./same"), LinkType::Internal);
    }

    #[test]
    fn test_header_validation() {
        let valid = HeaderMetadata {
            level: 3,
            text: "Title".to_string(),
            id: None,
            depth: 2,
            html_offset: 100,
        };
        assert!(valid.is_valid());

        let invalid_high = HeaderMetadata {
            level: 7,
            text: "Title".to_string(),
            id: None,
            depth: 2,
            html_offset: 100,
        };
        assert!(!invalid_high.is_valid());

        let invalid_low = HeaderMetadata {
            level: 0,
            text: "Title".to_string(),
            id: None,
            depth: 2,
            html_offset: 100,
        };
        assert!(!invalid_low.is_valid());
    }

    #[test]
    fn test_metadata_collector_new() {
        let config = MetadataConfig::default();
        let collector = MetadataCollector::new(config);

        // `Vec::with_capacity(n)` only guarantees room for at least `n` elements,
        // so assert a lower bound rather than an exact capacity.
        assert!(collector.headers.capacity() >= 32);
        assert!(collector.links.capacity() >= 64);
        assert!(collector.images.capacity() >= 16);
        assert!(collector.json_ld.capacity() >= 4);

        assert!(collector.headers.is_empty());
        assert!(collector.links.is_empty());
        assert!(collector.images.is_empty());
        assert!(collector.json_ld.is_empty());
    }

    #[test]
    fn test_metadata_collector_add_header() {
        let config = MetadataConfig::default();
        let mut collector = MetadataCollector::new(config);

        collector.add_header(1, "Title".to_string(), Some("title".to_string()), 0, 100);
        assert_eq!(collector.headers.len(), 1);

        let header = &collector.headers[0];
        assert_eq!(header.level, 1);
        assert_eq!(header.text, "Title");
        assert_eq!(header.id, Some("title".to_string()));

        // Levels outside 1..=6 are rejected on both sides of the range.
        collector.add_header(7, "Invalid".to_string(), None, 0, 200);
        assert_eq!(collector.headers.len(), 1);
        collector.add_header(0, "Invalid".to_string(), None, 0, 300);
        assert_eq!(collector.headers.len(), 1);
    }

    #[test]
    fn test_metadata_collector_add_link() {
        let config = MetadataConfig::default();
        let mut collector = MetadataCollector::new(config);

        collector.add_link(
            "https://example.com".to_string(),
            "Example".to_string(),
            Some("Visit".to_string()),
            Some("nofollow external".to_string()),
            BTreeMap::from([("data-id".to_string(), "example".to_string())]),
        );

        assert_eq!(collector.links.len(), 1);

        let link = &collector.links[0];
        assert_eq!(link.href, "https://example.com");
        assert_eq!(link.text, "Example");
        assert_eq!(link.link_type, LinkType::External);
        assert_eq!(link.rel, vec!["nofollow", "external"]);
        assert_eq!(link.attributes.get("data-id"), Some(&"example".to_string()));
    }

    #[test]
    fn test_metadata_collector_respects_config() {
        let config = MetadataConfig {
            extract_document: false,
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: DEFAULT_MAX_STRUCTURED_DATA_SIZE,
        };
        let mut collector = MetadataCollector::new(config);

        collector.add_header(1, "Title".to_string(), None, 0, 100);
        collector.add_link(
            "https://example.com".to_string(),
            "Link".to_string(),
            None,
            None,
            BTreeMap::new(),
        );
        collector.add_image(
            "https://example.com/img.jpg".to_string(),
            None,
            None,
            None,
            BTreeMap::new(),
        );
        collector.add_json_ld("{}".to_string());

        assert!(collector.headers.is_empty());
        assert!(collector.links.is_empty());
        assert!(collector.images.is_empty());
        assert!(collector.json_ld.is_empty());
    }

    #[test]
    fn test_metadata_collector_finish() {
        let config = MetadataConfig::default();
        let mut collector = MetadataCollector::new(config);

        collector.set_language("en".to_string());
        collector.add_header(1, "Main Title".to_string(), None, 0, 100);
        collector.add_link(
            "https://example.com".to_string(),
            "Example".to_string(),
            None,
            None,
            BTreeMap::new(),
        );

        let metadata = collector.finish();

        assert_eq!(metadata.document.language, Some("en".to_string()));
        assert_eq!(metadata.headers.len(), 1);
        assert_eq!(metadata.links.len(), 1);
    }

    #[test]
    fn test_document_metadata_default() {
        let doc = DocumentMetadata::default();

        assert!(doc.title.is_none());
        assert!(doc.description.is_none());
        assert!(doc.keywords.is_empty());
        assert!(doc.open_graph.is_empty());
        assert!(doc.twitter_card.is_empty());
        assert!(doc.meta_tags.is_empty());
    }

    #[test]
    fn test_metadata_config_default() {
        let config = MetadataConfig::default();

        assert!(config.extract_headers);
        assert!(config.extract_links);
        assert!(config.extract_images);
        assert!(config.extract_structured_data);
        assert_eq!(config.max_structured_data_size, DEFAULT_MAX_STRUCTURED_DATA_SIZE);
    }

    #[test]
    fn test_image_type_classification() {
        // Go through `add_image` so the classification logic itself is exercised.
        let mut collector = MetadataCollector::new(MetadataConfig::default());

        let cases = [
            ("data:image/png;base64,iVBORw0KG...", ImageType::DataUri),
            ("https://example.com/image.jpg", ImageType::External),
            ("http://example.com/image.jpg", ImageType::External),
            ("<svg xmlns=\"http://www.w3.org/2000/svg\"></svg>", ImageType::InlineSvg),
            ("images/photo.png", ImageType::Relative),
        ];

        for (src, expected) in cases {
            collector.add_image(src.to_string(), None, None, None, BTreeMap::new());

            let image = collector.images.last().expect("image should have been recorded");
            assert_eq!(image.src, src);
            assert_eq!(image.image_type, expected);
        }

        assert_eq!(collector.images.len(), 5);
    }

    #[test]
    fn test_link_type_display() {
        assert_eq!(LinkType::Anchor.to_string(), "anchor");
        assert_eq!(LinkType::Internal.to_string(), "internal");
        assert_eq!(LinkType::External.to_string(), "external");
        assert_eq!(LinkType::Email.to_string(), "email");
        assert_eq!(LinkType::Phone.to_string(), "phone");
        assert_eq!(LinkType::Other.to_string(), "other");
    }

    #[test]
    fn test_structured_data_type_display() {
        assert_eq!(StructuredDataType::JsonLd.to_string(), "json_ld");
        assert_eq!(StructuredDataType::Microdata.to_string(), "microdata");
        assert_eq!(StructuredDataType::RDFa.to_string(), "rdfa");
    }

    #[test]
    fn test_categorize_links() {
        let config = MetadataConfig::default();
        let mut collector = MetadataCollector::new(config);

        collector.add_link("#anchor".to_string(), "Anchor".to_string(), None, None, BTreeMap::new());
        collector.add_link(
            "https://example.com".to_string(),
            "External".to_string(),
            None,
            None,
            BTreeMap::new(),
        );
        collector.add_link(
            "mailto:test@example.com".to_string(),
            "Email".to_string(),
            None,
            None,
            BTreeMap::new(),
        );

        let categorized = collector.categorize_links();

        assert_eq!(categorized.get("anchor").map(|v| v.len()), Some(1));
        assert_eq!(categorized.get("external").map(|v| v.len()), Some(1));
        assert_eq!(categorized.get("email").map(|v| v.len()), Some(1));
    }

    #[test]
    fn test_header_counts() {
        let config = MetadataConfig::default();
        let mut collector = MetadataCollector::new(config);

        collector.add_header(1, "H1".to_string(), None, 0, 100);
        collector.add_header(2, "H2".to_string(), None, 1, 200);
        collector.add_header(2, "H2b".to_string(), None, 1, 300);
        collector.add_header(3, "H3".to_string(), None, 2, 400);

        let counts = collector.header_counts();

        assert_eq!(counts.get("1").copied(), Some(1));
        assert_eq!(counts.get("2").copied(), Some(2));
        assert_eq!(counts.get("3").copied(), Some(1));
        assert_eq!(counts.get("4"), None);
    }

    #[test]
    fn test_add_json_ld_enforces_size_limit() {
        let config = MetadataConfig {
            max_structured_data_size: 10,
            ..MetadataConfig::default()
        };
        let mut collector = MetadataCollector::new(config);

        // A single block larger than the limit is dropped.
        collector.add_json_ld("x".repeat(11));
        assert!(collector.json_ld.is_empty());
        assert_eq!(collector.structured_data_size, 0);

        collector.add_json_ld("x".repeat(8));
        assert_eq!(collector.json_ld.len(), 1);

        // 8 + 3 would exceed the budget, so the block is dropped.
        collector.add_json_ld("x".repeat(3));
        assert_eq!(collector.json_ld.len(), 1);

        // 8 + 2 lands exactly on the limit and is accepted.
        collector.add_json_ld("x".repeat(2));
        assert_eq!(collector.json_ld.len(), 2);
        assert_eq!(collector.structured_data_size, 10);
    }

    #[test]
    fn test_finish_builds_document_metadata() {
        let mut collector = MetadataCollector::new(MetadataConfig::default());

        collector.set_head_metadata(BTreeMap::from([
            ("title".to_string(), "Page".to_string()),
            ("meta-keywords".to_string(), "html, markdown, , rust".to_string()),
            ("og:image:width".to_string(), "640".to_string()),
            ("twitter:card".to_string(), "summary".to_string()),
            ("meta-viewport".to_string(), "width=device-width".to_string()),
        ]));
        // First value wins; parsing is case-insensitive.
        collector.set_text_direction("RTL".to_string());
        collector.set_text_direction("ltr".to_string());

        let doc = collector.finish().document;

        assert_eq!(doc.title.as_deref(), Some("Page"));
        assert_eq!(
            doc.keywords.iter().map(String::as_str).collect::<Vec<_>>(),
            vec!["html", "markdown", "rust"]
        );
        assert_eq!(doc.open_graph.get("image_width").map(String::as_str), Some("640"));
        assert_eq!(doc.twitter_card.get("card").map(String::as_str), Some("summary"));
        assert_eq!(
            doc.meta_tags.get("viewport").map(String::as_str),
            Some("width=device-width")
        );
        assert_eq!(doc.text_direction, Some(TextDirection::RightToLeft));
    }

    #[test]
    fn test_structured_data_schema_type_detection() {
        let config = MetadataConfig {
            max_structured_data_size: 4096,
            ..MetadataConfig::default()
        };
        let mut collector = MetadataCollector::new(config);

        collector.add_json_ld(r#"{"@context": "https://schema.org", "@type": "Article"}"#.to_string());
        collector.add_json_ld(r#"{"@type": ["Product", "Thing"]}"#.to_string());
        collector.add_json_ld(r#"{"name": "untyped"}"#.to_string());

        let metadata = collector.finish();
        let schema_types: Vec<Option<&str>> = metadata
            .structured_data
            .iter()
            .map(|item| item.schema_type.as_deref())
            .collect();

        assert_eq!(schema_types, vec![Some("Article"), Some("Product"), None]);
    }
}
html_to_markdown_rs/metadata.rs

html_to_markdown_rs/
metadata.rs