html_to_markdown_rs/metadata.rs
1//! Metadata extraction for HTML to Markdown conversion.
2//!
3//! This module provides comprehensive, type-safe metadata extraction during HTML-to-Markdown
4//! conversion, enabling content analysis, SEO optimization, and document indexing workflows.
5//! Metadata includes:
6//! - **Document metadata**: Title, description, author, language, canonical URL, Open Graph, Twitter Card
7//! - **Headers**: Heading elements (h1-h6) with hierarchy, IDs, and positions
8//! - **Links**: Hyperlinks with type classification (anchor, internal, external, email, phone)
9//! - **Images**: Image elements with source, alt text, dimensions, and type (data URI, external, etc.)
10//! - **Structured data**: JSON-LD, Microdata, and RDFa blocks
11//!
12//! The implementation follows a single-pass collector pattern for zero-overhead extraction
13//! when metadata features are disabled.
14//!
15//! # Architecture
16//!
17//! Metadata extraction uses the [`MetadataCollector`] pattern (similar to [`InlineImageCollector`]):
18//! - **Single-pass collection**: Metadata is gathered during the primary tree traversal without additional passes
19//! - **Zero overhead when disabled**: Entire module can be compiled out via feature flags
20//! - **Configurable granularity**: Use [`MetadataConfig`] to select which metadata types to extract
21//! - **Type-safe APIs**: All metadata types are enum-based with exhaustive matching
22//! - **Memory-bounded**: Size limits prevent memory exhaustion from adversarial documents
23//! - **Pre-allocated buffers**: Typical documents (32 headers, 64 links, 16 images) handled efficiently
24//!
25//! # Type Overview
26//!
27//! ## Enumerations
28//!
29//! - [`TextDirection`]: Document directionality (LTR, RTL, Auto)
30//! - [`LinkType`]: Link classification (Anchor, Internal, External, Email, Phone, Other)
31//! - [`ImageType`]: Image source type (DataUri, External, Relative, InlineSvg)
32//! - [`StructuredDataType`]: Structured data format (JsonLd, Microdata, RDFa)
33//!
34//! ## Structures
35//!
36//! - [`DocumentMetadata`]: Head-level metadata with maps for Open Graph and Twitter Card
37//! - [`HeaderMetadata`]: Heading element with level (1-6), text, ID, hierarchy depth, and position
38//! - [`LinkMetadata`]: Hyperlink with href, text, title, type, rel attributes, and custom attributes
39//! - [`ImageMetadata`]: Image element with src, alt, title, dimensions, type, and attributes
40//! - [`StructuredData`]: Structured data block with type and raw JSON
41//! - [`MetadataConfig`]: Configuration controlling extraction granularity and size limits
42//! - [`ExtendedMetadata`]: Top-level result containing all extracted metadata
43//!
44//! # Examples
45//!
46//! ## Basic Usage with convert_with_metadata
47//!
48//! ```ignore
49//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
50//!
51//! let html = r#"
52//! <html lang="en">
53//! <head>
54//! <title>My Article</title>
55//! <meta name="description" content="An interesting read">
56//! </head>
57//! <body>
58//! <h1 id="main">Title</h1>
59//! <a href="https://example.com">External Link</a>
60//! <img src="photo.jpg" alt="A photo">
61//! </body>
62//! </html>
63//! "#;
64//!
65//! let config = MetadataConfig::default();
66//! let (markdown, metadata) = convert_with_metadata(html, None, config)?;
67//!
68//! // Access document metadata
69//! assert_eq!(metadata.document.title, Some("My Article".to_string()));
70//! assert_eq!(metadata.document.language, Some("en".to_string()));
71//!
72//! // Access headers
73//! assert_eq!(metadata.headers.len(), 1);
74//! assert_eq!(metadata.headers[0].level, 1);
75//! assert_eq!(metadata.headers[0].id, Some("main".to_string()));
76//!
77//! // Access links
78//! assert_eq!(metadata.links.len(), 1);
79//! assert_eq!(metadata.links[0].link_type, LinkType::External);
80//!
81//! // Access images
82//! assert_eq!(metadata.images.len(), 1);
83//! assert_eq!(metadata.images[0].image_type, ImageType::Relative);
84//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
85//! ```
86//!
87//! ## Selective Extraction
88//!
89//! ```ignore
90//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
91//!
92//! let config = MetadataConfig {
93//! extract_headers: true,
94//! extract_links: true,
95//! extract_images: false, // Skip images
96//! extract_structured_data: false, // Skip structured data
97//! max_structured_data_size: 0,
98//! };
99//!
100//! let (markdown, metadata) = convert_with_metadata(html, None, config)?;
101//! assert_eq!(metadata.images.len(), 0); // Images not extracted
102//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
103//! ```
104//!
105//! ## Analyzing Link Types
106//!
107//! ```ignore
108//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
109//! use html_to_markdown_rs::metadata::LinkType;
110//!
111//! let (_markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default())?;
112//!
113//! for link in &metadata.links {
114//! match link.link_type {
115//! LinkType::External => println!("External: {}", link.href),
116//! LinkType::Internal => println!("Internal: {}", link.href),
117//! LinkType::Anchor => println!("Anchor: {}", link.href),
118//! LinkType::Email => println!("Email: {}", link.href),
119//! _ => {}
120//! }
121//! }
122//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
123//! ```
124//!
125//! # Serialization
126//!
127//! All types in this module support serialization via `serde` when the `metadata` feature is enabled.
128//! This enables easy export to JSON, YAML, or other formats:
129//!
130//! ```ignore
131//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
132//!
133//! let (_markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default())?;
134//! let json = serde_json::to_string_pretty(&metadata)?;
135//! println!("{}", json);
136//! # Ok::<(), Box<dyn std::error::Error>>(())
137//! ```
138
139use std::cell::RefCell;
140use std::collections::BTreeMap;
141use std::rc::Rc;
142
143/// Text directionality of document content.
144///
145/// Corresponds to the HTML `dir` attribute and `bdi` element directionality.
146#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
147#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
148pub enum TextDirection {
149 /// Left-to-right text flow (default for Latin scripts)
150 #[cfg_attr(feature = "metadata", serde(rename = "ltr"))]
151 LeftToRight,
152 /// Right-to-left text flow (Hebrew, Arabic, Urdu, etc.)
153 #[cfg_attr(feature = "metadata", serde(rename = "rtl"))]
154 RightToLeft,
155 /// Automatic directionality detection
156 #[cfg_attr(feature = "metadata", serde(rename = "auto"))]
157 Auto,
158}
159
160impl std::fmt::Display for TextDirection {
161 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
162 match self {
163 Self::LeftToRight => write!(f, "ltr"),
164 Self::RightToLeft => write!(f, "rtl"),
165 Self::Auto => write!(f, "auto"),
166 }
167 }
168}
169
170impl TextDirection {
171 /// Parse a text direction from string value.
172 ///
173 /// # Arguments
174 ///
175 /// * `s` - Direction string ("ltr", "rtl", or "auto")
176 ///
177 /// # Returns
178 ///
179 /// `Some(TextDirection)` if valid, `None` otherwise.
180 ///
181 /// # Examples
182 ///
183 /// ```
184 /// # use html_to_markdown_rs::metadata::TextDirection;
185 /// assert_eq!(TextDirection::parse("ltr"), Some(TextDirection::LeftToRight));
186 /// assert_eq!(TextDirection::parse("rtl"), Some(TextDirection::RightToLeft));
187 /// assert_eq!(TextDirection::parse("auto"), Some(TextDirection::Auto));
188 /// assert_eq!(TextDirection::parse("invalid"), None);
189 /// ```
190 pub fn parse(s: &str) -> Option<Self> {
191 if s.eq_ignore_ascii_case("ltr") {
192 return Some(Self::LeftToRight);
193 }
194 if s.eq_ignore_ascii_case("rtl") {
195 return Some(Self::RightToLeft);
196 }
197 if s.eq_ignore_ascii_case("auto") {
198 return Some(Self::Auto);
199 }
200 None
201 }
202}
203
204/// Link classification based on href value and document context.
205///
206/// Used to categorize links during extraction for filtering and analysis.
207#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
208#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
209#[cfg_attr(feature = "metadata", serde(rename_all = "snake_case"))]
210pub enum LinkType {
211 /// Anchor link within same document (href starts with #)
212 Anchor,
213 /// Internal link within same domain
214 Internal,
215 /// External link to different domain
216 External,
217 /// Email link (mailto:)
218 Email,
219 /// Phone link (tel:)
220 Phone,
221 /// Other protocol or unclassifiable
222 Other,
223}
224
225impl std::fmt::Display for LinkType {
226 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
227 match self {
228 Self::Anchor => write!(f, "anchor"),
229 Self::Internal => write!(f, "internal"),
230 Self::External => write!(f, "external"),
231 Self::Email => write!(f, "email"),
232 Self::Phone => write!(f, "phone"),
233 Self::Other => write!(f, "other"),
234 }
235 }
236}
237
238/// Image source classification for proper handling and processing.
239///
240/// Determines whether an image is embedded (data URI), inline SVG, external, or relative.
241#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
242#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
243#[cfg_attr(feature = "metadata", serde(rename_all = "snake_case"))]
244pub enum ImageType {
245 /// Data URI embedded image (base64 or other encoding)
246 DataUri,
247 /// Inline SVG element
248 InlineSvg,
249 /// External image URL (http/https)
250 External,
251 /// Relative image path
252 Relative,
253}
254
255impl std::fmt::Display for ImageType {
256 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
257 match self {
258 Self::DataUri => write!(f, "data_uri"),
259 Self::InlineSvg => write!(f, "inline_svg"),
260 Self::External => write!(f, "external"),
261 Self::Relative => write!(f, "relative"),
262 }
263 }
264}
265
266/// Structured data format type.
267///
268/// Identifies the schema/format used for structured data markup.
269#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
270#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
271#[cfg_attr(feature = "metadata", serde(rename_all = "snake_case"))]
272pub enum StructuredDataType {
273 /// JSON-LD (JSON for Linking Data) script blocks
274 #[cfg_attr(feature = "metadata", serde(rename = "json_ld"))]
275 JsonLd,
276 /// HTML5 Microdata attributes (itemscope, itemtype, itemprop)
277 Microdata,
278 /// RDF in Attributes (RDFa) markup
279 #[cfg_attr(feature = "metadata", serde(rename = "rdfa"))]
280 RDFa,
281}
282
283impl std::fmt::Display for StructuredDataType {
284 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
285 match self {
286 Self::JsonLd => write!(f, "json_ld"),
287 Self::Microdata => write!(f, "microdata"),
288 Self::RDFa => write!(f, "rdfa"),
289 }
290 }
291}
292
293/// Document-level metadata extracted from `<head>` and top-level elements.
294///
295/// Contains all metadata typically used by search engines, social media platforms,
296/// and browsers for document indexing and presentation.
297///
298/// # Examples
299///
300/// ```
301/// # use html_to_markdown_rs::metadata::DocumentMetadata;
302/// let doc = DocumentMetadata {
303/// title: Some("My Article".to_string()),
304/// description: Some("A great article about Rust".to_string()),
305/// keywords: vec!["rust".to_string(), "programming".to_string()],
306/// ..Default::default()
307/// };
308///
309/// assert_eq!(doc.title, Some("My Article".to_string()));
310/// ```
311#[derive(Debug, Clone, Default)]
312#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
313pub struct DocumentMetadata {
314 /// Document title from `<title>` tag
315 pub title: Option<String>,
316
317 /// Document description from `<meta name="description">` tag
318 pub description: Option<String>,
319
320 /// Document keywords from `<meta name="keywords">` tag, split on commas
321 pub keywords: Vec<String>,
322
323 /// Document author from `<meta name="author">` tag
324 pub author: Option<String>,
325
326 /// Canonical URL from `<link rel="canonical">` tag
327 pub canonical_url: Option<String>,
328
329 /// Base URL from `<base href="">` tag for resolving relative URLs
330 pub base_href: Option<String>,
331
332 /// Document language from `lang` attribute
333 pub language: Option<String>,
334
335 /// Document text direction from `dir` attribute
336 pub text_direction: Option<TextDirection>,
337
338 /// Open Graph metadata (og:* properties) for social media
339 /// Keys like "title", "description", "image", "url", etc.
340 pub open_graph: BTreeMap<String, String>,
341
342 /// Twitter Card metadata (twitter:* properties)
343 /// Keys like "card", "site", "creator", "title", "description", "image", etc.
344 pub twitter_card: BTreeMap<String, String>,
345
346 /// Additional meta tags not covered by specific fields
347 /// Keys are meta name/property attributes, values are content
348 pub meta_tags: BTreeMap<String, String>,
349}
350
351/// Header element metadata with hierarchy tracking.
352///
353/// Captures heading elements (h1-h6) with their text content, identifiers,
354/// and position in the document structure.
355///
356/// # Examples
357///
358/// ```
359/// # use html_to_markdown_rs::metadata::HeaderMetadata;
360/// let header = HeaderMetadata {
361/// level: 1,
362/// text: "Main Title".to_string(),
363/// id: Some("main-title".to_string()),
364/// depth: 0,
365/// html_offset: 145,
366/// };
367///
368/// assert_eq!(header.level, 1);
369/// assert!(header.is_valid());
370/// ```
371#[derive(Debug, Clone)]
372#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
373pub struct HeaderMetadata {
374 /// Header level: 1 (h1) through 6 (h6)
375 pub level: u8,
376
377 /// Normalized text content of the header
378 pub text: String,
379
380 /// HTML id attribute if present
381 pub id: Option<String>,
382
383 /// Document tree depth at the header element
384 pub depth: usize,
385
386 /// Byte offset in original HTML document
387 pub html_offset: usize,
388}
389
390impl HeaderMetadata {
391 /// Validate that the header level is within valid range (1-6).
392 ///
393 /// # Returns
394 ///
395 /// `true` if level is 1-6, `false` otherwise.
396 ///
397 /// # Examples
398 ///
399 /// ```
400 /// # use html_to_markdown_rs::metadata::HeaderMetadata;
401 /// let valid = HeaderMetadata {
402 /// level: 3,
403 /// text: "Title".to_string(),
404 /// id: None,
405 /// depth: 2,
406 /// html_offset: 100,
407 /// };
408 /// assert!(valid.is_valid());
409 ///
410 /// let invalid = HeaderMetadata {
411 /// level: 7, // Invalid
412 /// text: "Title".to_string(),
413 /// id: None,
414 /// depth: 2,
415 /// html_offset: 100,
416 /// };
417 /// assert!(!invalid.is_valid());
418 /// ```
419 pub fn is_valid(&self) -> bool {
420 self.level >= 1 && self.level <= 6
421 }
422}
423
424/// Hyperlink metadata with categorization and attributes.
425///
426/// Represents `<a>` elements with parsed href values, text content, and link type classification.
427///
428/// # Examples
429///
430/// ```
431/// # use html_to_markdown_rs::metadata::{LinkMetadata, LinkType};
432/// let link = LinkMetadata {
433/// href: "https://example.com".to_string(),
434/// text: "Example".to_string(),
435/// title: Some("Visit Example".to_string()),
436/// link_type: LinkType::External,
437/// rel: vec!["nofollow".to_string()],
438/// attributes: Default::default(),
439/// };
440///
441/// assert_eq!(link.link_type, LinkType::External);
442/// assert_eq!(link.text, "Example");
443/// ```
444#[derive(Debug, Clone)]
445#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
446pub struct LinkMetadata {
447 /// The href URL value
448 pub href: String,
449
450 /// Link text content (normalized, concatenated if mixed with elements)
451 pub text: String,
452
453 /// Optional title attribute (often shown as tooltip)
454 pub title: Option<String>,
455
456 /// Link type classification
457 pub link_type: LinkType,
458
459 /// Rel attribute values (e.g., "nofollow", "stylesheet", "canonical")
460 pub rel: Vec<String>,
461
462 /// Additional HTML attributes
463 pub attributes: BTreeMap<String, String>,
464}
465
466impl LinkMetadata {
467 /// Classify a link based on href value.
468 ///
469 /// # Arguments
470 ///
471 /// * `href` - The href attribute value
472 ///
473 /// # Returns
474 ///
475 /// Appropriate [`LinkType`] based on protocol and content.
476 ///
477 /// # Examples
478 ///
479 /// ```
480 /// # use html_to_markdown_rs::metadata::{LinkMetadata, LinkType};
481 /// assert_eq!(LinkMetadata::classify_link("#section"), LinkType::Anchor);
482 /// assert_eq!(LinkMetadata::classify_link("mailto:test@example.com"), LinkType::Email);
483 /// assert_eq!(LinkMetadata::classify_link("tel:+1234567890"), LinkType::Phone);
484 /// assert_eq!(LinkMetadata::classify_link("https://example.com"), LinkType::External);
485 /// ```
486 pub fn classify_link(href: &str) -> LinkType {
487 if href.starts_with('#') {
488 LinkType::Anchor
489 } else if href.starts_with("mailto:") {
490 LinkType::Email
491 } else if href.starts_with("tel:") {
492 LinkType::Phone
493 } else if href.starts_with("http://") || href.starts_with("https://") {
494 LinkType::External
495 } else if href.starts_with('/') || href.starts_with("../") || href.starts_with("./") {
496 LinkType::Internal
497 } else {
498 LinkType::Other
499 }
500 }
501}
502
503/// Image metadata with source and dimensions.
504///
505/// Captures `<img>` elements and inline `<svg>` elements with metadata
506/// for image analysis and optimization.
507///
508/// # Examples
509///
510/// ```
511/// # use html_to_markdown_rs::metadata::{ImageMetadata, ImageType};
512/// let img = ImageMetadata {
513/// src: "https://example.com/image.jpg".to_string(),
514/// alt: Some("An example image".to_string()),
515/// title: Some("Example".to_string()),
516/// dimensions: Some((800, 600)),
517/// image_type: ImageType::External,
518/// attributes: Default::default(),
519/// };
520///
521/// assert_eq!(img.image_type, ImageType::External);
522/// ```
523#[derive(Debug, Clone)]
524#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
525pub struct ImageMetadata {
526 /// Image source (URL, data URI, or SVG content identifier)
527 pub src: String,
528
529 /// Alternative text from alt attribute (for accessibility)
530 pub alt: Option<String>,
531
532 /// Title attribute (often shown as tooltip)
533 pub title: Option<String>,
534
535 /// Image dimensions as (width, height) if available
536 pub dimensions: Option<(u32, u32)>,
537
538 /// Image type classification
539 pub image_type: ImageType,
540
541 /// Additional HTML attributes
542 pub attributes: BTreeMap<String, String>,
543}
544
545/// Structured data block (JSON-LD, Microdata, or RDFa).
546///
547/// Represents machine-readable structured data found in the document.
548/// JSON-LD blocks are collected as raw JSON strings for flexibility.
549///
550/// # Examples
551///
552/// ```
553/// # use html_to_markdown_rs::metadata::{StructuredData, StructuredDataType};
554/// let schema = StructuredData {
555/// data_type: StructuredDataType::JsonLd,
556/// raw_json: r#"{"@context":"https://schema.org","@type":"Article"}"#.to_string(),
557/// schema_type: Some("Article".to_string()),
558/// };
559///
560/// assert_eq!(schema.data_type, StructuredDataType::JsonLd);
561/// ```
562#[derive(Debug, Clone)]
563#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
564pub struct StructuredData {
565 /// Type of structured data (JSON-LD, Microdata, RDFa)
566 pub data_type: StructuredDataType,
567
568 /// Raw JSON string (for JSON-LD) or serialized representation
569 pub raw_json: String,
570
571 /// Schema type if detectable (e.g., "Article", "Event", "Product")
572 pub schema_type: Option<String>,
573}
574
575/// Default maximum size for structured data extraction (1 MB)
576pub const DEFAULT_MAX_STRUCTURED_DATA_SIZE: usize = 1_000_000;
577
578/// Configuration for metadata extraction granularity.
579///
580/// Controls which metadata types are extracted and size limits for safety.
581///
582/// # Examples
583///
584/// ```
585/// # use html_to_markdown_rs::metadata::MetadataConfig;
586/// let config = MetadataConfig {
587/// extract_document: true,
588/// extract_headers: true,
589/// extract_links: true,
590/// extract_images: true,
591/// extract_structured_data: true,
592/// max_structured_data_size: 1_000_000,
593/// };
594///
595/// assert!(config.extract_headers);
596/// ```
597#[derive(Debug, Clone)]
598#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
599pub struct MetadataConfig {
600 /// Extract document-level metadata (title, description, author, etc.)
601 pub extract_document: bool,
602
603 /// Extract h1-h6 header elements and their hierarchy
604 pub extract_headers: bool,
605
606 /// Extract anchor (a) elements as links with type classification
607 pub extract_links: bool,
608
609 /// Extract image elements and data URIs
610 pub extract_images: bool,
611
612 /// Extract structured data (JSON-LD, Microdata, RDFa)
613 pub extract_structured_data: bool,
614
615 /// Maximum total size of structured data to collect (bytes)
616 /// Prevents memory exhaustion on malformed or adversarial documents
617 pub max_structured_data_size: usize,
618}
619
620/// Partial update for MetadataConfig.
621#[derive(Debug, Clone, Default)]
622#[cfg_attr(any(feature = "serde", feature = "metadata"), derive(serde::Deserialize))]
623#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
624pub struct MetadataConfigUpdate {
625 #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_document"))]
626 pub extract_document: Option<bool>,
627 #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_headers"))]
628 pub extract_headers: Option<bool>,
629 #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_links"))]
630 pub extract_links: Option<bool>,
631 #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_images"))]
632 pub extract_images: Option<bool>,
633 #[cfg_attr(
634 any(feature = "serde", feature = "metadata"),
635 serde(alias = "extract_structured_data")
636 )]
637 pub extract_structured_data: Option<bool>,
638 #[cfg_attr(
639 any(feature = "serde", feature = "metadata"),
640 serde(alias = "max_structured_data_size")
641 )]
642 pub max_structured_data_size: Option<usize>,
643}
644
645impl Default for MetadataConfig {
646 /// Create default metadata configuration.
647 ///
648 /// Defaults to extracting all metadata types with 1MB limit on structured data.
649 fn default() -> Self {
650 Self {
651 extract_document: true,
652 extract_headers: true,
653 extract_links: true,
654 extract_images: true,
655 extract_structured_data: true,
656 max_structured_data_size: DEFAULT_MAX_STRUCTURED_DATA_SIZE,
657 }
658 }
659}
660
661impl MetadataConfig {
662 pub fn any_enabled(&self) -> bool {
663 self.extract_document
664 || self.extract_headers
665 || self.extract_links
666 || self.extract_images
667 || self.extract_structured_data
668 }
669
670 pub fn apply_update(&mut self, update: MetadataConfigUpdate) {
671 if let Some(extract_document) = update.extract_document {
672 self.extract_document = extract_document;
673 }
674 if let Some(extract_headers) = update.extract_headers {
675 self.extract_headers = extract_headers;
676 }
677 if let Some(extract_links) = update.extract_links {
678 self.extract_links = extract_links;
679 }
680 if let Some(extract_images) = update.extract_images {
681 self.extract_images = extract_images;
682 }
683 if let Some(extract_structured_data) = update.extract_structured_data {
684 self.extract_structured_data = extract_structured_data;
685 }
686 if let Some(max_structured_data_size) = update.max_structured_data_size {
687 self.max_structured_data_size = max_structured_data_size;
688 }
689 }
690
691 pub fn from_update(update: MetadataConfigUpdate) -> Self {
692 let mut config = Self::default();
693 config.apply_update(update);
694 config
695 }
696}
697
698impl From<MetadataConfigUpdate> for MetadataConfig {
699 fn from(update: MetadataConfigUpdate) -> Self {
700 Self::from_update(update)
701 }
702}
703
704/// Comprehensive metadata extraction result from HTML document.
705///
706/// Contains all extracted metadata types in a single structure,
707/// suitable for serialization and transmission across language boundaries.
708///
709/// # Examples
710///
711/// ```
712/// # use html_to_markdown_rs::metadata::ExtendedMetadata;
713/// let metadata = ExtendedMetadata {
714/// document: Default::default(),
715/// headers: Vec::new(),
716/// links: Vec::new(),
717/// images: Vec::new(),
718/// structured_data: Vec::new(),
719/// };
720///
721/// assert!(metadata.headers.is_empty());
722/// ```
723#[derive(Debug, Clone, Default)]
724#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
725pub struct ExtendedMetadata {
726 /// Document-level metadata (title, description, canonical, etc.)
727 pub document: DocumentMetadata,
728
729 /// Extracted header elements with hierarchy
730 pub headers: Vec<HeaderMetadata>,
731
732 /// Extracted hyperlinks with type classification
733 pub links: Vec<LinkMetadata>,
734
735 /// Extracted images with source and dimensions
736 pub images: Vec<ImageMetadata>,
737
738 /// Extracted structured data blocks
739 pub structured_data: Vec<StructuredData>,
740}
741
742/// Internal metadata collector for single-pass extraction.
743///
744/// Follows the [`InlineImageCollector`](crate::inline_images::InlineImageCollector) pattern
745/// for efficient metadata extraction during tree traversal. Maintains state for:
746/// - Document metadata from head elements
747/// - Header hierarchy tracking
748/// - Link accumulation
749/// - Structured data collection
750/// - Language and directionality attributes
751///
752/// # Architecture
753///
754/// The collector is designed to be:
755/// - **Performant**: Pre-allocated collections, minimal cloning
756/// - **Single-pass**: Collects during main tree walk without separate passes
757/// - **Optional**: Zero overhead when disabled via feature flags
758/// - **Type-safe**: Strict separation of collection and result types
759///
760/// # Internal State
761///
762/// - `head_metadata`: Raw metadata pairs from head element
763/// - `headers`: Collected header elements
764/// - `header_stack`: For tracking nesting depth
765/// - `links`: Collected link elements
766/// - `base_href`: Base URL for relative link resolution
767/// - `json_ld`: JSON-LD script block contents
768/// - `lang`: Document language
769/// - `dir`: Document text direction
770#[derive(Debug)]
771#[allow(dead_code)]
772pub(crate) struct MetadataCollector {
773 head_metadata: BTreeMap<String, String>,
774 headers: Vec<HeaderMetadata>,
775 header_stack: Vec<usize>,
776 links: Vec<LinkMetadata>,
777 images: Vec<ImageMetadata>,
778 json_ld: Vec<String>,
779 structured_data_size: usize,
780 config: MetadataConfig,
781 lang: Option<String>,
782 dir: Option<String>,
783}
784
785#[allow(dead_code)]
786impl MetadataCollector {
787 /// Create a new metadata collector with configuration.
788 ///
789 /// Pre-allocates collections based on typical document sizes
790 /// for efficient append operations during traversal.
791 ///
792 /// # Arguments
793 ///
794 /// * `config` - Extraction configuration specifying which types to collect
795 ///
796 /// # Returns
797 ///
798 /// A new collector ready for use during tree traversal.
799 ///
800 /// # Examples
801 ///
802 /// ```ignore
803 /// let config = MetadataConfig::default();
804 /// let collector = MetadataCollector::new(config);
805 /// ```
806 pub(crate) fn new(config: MetadataConfig) -> Self {
807 Self {
808 head_metadata: BTreeMap::new(),
809 headers: Vec::with_capacity(32),
810 header_stack: Vec::with_capacity(6),
811 links: Vec::with_capacity(64),
812 images: Vec::with_capacity(16),
813 json_ld: Vec::with_capacity(4),
814 structured_data_size: 0,
815 config,
816 lang: None,
817 dir: None,
818 }
819 }
820
821 /// Add a header element to the collection.
822 ///
823 /// Validates that level is in range 1-6 and tracks hierarchy via depth.
824 ///
825 /// # Arguments
826 ///
827 /// * `level` - Header level (1-6)
828 /// * `text` - Normalized header text content
829 /// * `id` - Optional HTML id attribute
830 /// * `depth` - Current document nesting depth
831 /// * `html_offset` - Byte offset in original HTML
832 pub(crate) fn add_header(&mut self, level: u8, text: String, id: Option<String>, depth: usize, html_offset: usize) {
833 if !self.config.extract_headers {
834 return;
835 }
836
837 if !(1..=6).contains(&level) {
838 return;
839 }
840
841 let header = HeaderMetadata {
842 level,
843 text,
844 id,
845 depth,
846 html_offset,
847 };
848
849 self.headers.push(header);
850 }
851
852 /// Add a link element to the collection.
853 ///
854 /// Classifies the link based on href value and stores with metadata.
855 ///
856 /// # Arguments
857 ///
858 /// * `href` - The href attribute value
859 /// * `text` - Link text content
860 /// * `title` - Optional title attribute
861 /// * `rel` - Comma/space-separated rel attribute value
862 /// * `attributes` - Additional attributes to capture (e.g., data-* or aria-* values)
863 pub(crate) fn add_link(
864 &mut self,
865 href: String,
866 text: String,
867 title: Option<String>,
868 rel: Option<String>,
869 attributes: BTreeMap<String, String>,
870 ) {
871 if !self.config.extract_links {
872 return;
873 }
874
875 let link_type = LinkMetadata::classify_link(&href);
876
877 let rel_vec = rel
878 .map(|r| r.split_whitespace().map(|s| s.to_string()).collect::<Vec<_>>())
879 .unwrap_or_default();
880
881 let link = LinkMetadata {
882 href,
883 text,
884 title,
885 link_type,
886 rel: rel_vec,
887 attributes,
888 };
889
890 self.links.push(link);
891 }
892
893 /// Add an image element to the collection.
894 ///
895 /// # Arguments
896 ///
897 /// * `src` - Image source (URL or data URI)
898 /// * `alt` - Optional alt text
899 /// * `title` - Optional title attribute
900 /// * `dimensions` - Optional (width, height) tuple
901 pub(crate) fn add_image(
902 &mut self,
903 src: String,
904 alt: Option<String>,
905 title: Option<String>,
906 dimensions: Option<(u32, u32)>,
907 attributes: BTreeMap<String, String>,
908 ) {
909 if !self.config.extract_images {
910 return;
911 }
912
913 let image_type = if src.starts_with("data:") {
914 ImageType::DataUri
915 } else if src.starts_with("http://") || src.starts_with("https://") {
916 ImageType::External
917 } else if src.starts_with('<') && src.contains("svg") {
918 ImageType::InlineSvg
919 } else {
920 ImageType::Relative
921 };
922
923 let image = ImageMetadata {
924 src,
925 alt,
926 title,
927 dimensions,
928 image_type,
929 attributes,
930 };
931
932 self.images.push(image);
933 }
934
935 /// Add a JSON-LD structured data block.
936 ///
937 /// Accumulates JSON content with size validation against configured limits.
938 ///
939 /// # Arguments
940 ///
941 /// * `json_content` - Raw JSON string content
942 pub(crate) fn add_json_ld(&mut self, json_content: String) {
943 if !self.config.extract_structured_data {
944 return;
945 }
946
947 let content_size = json_content.len();
948 if content_size > self.config.max_structured_data_size {
949 return;
950 }
951 if self.structured_data_size + content_size > self.config.max_structured_data_size {
952 return;
953 }
954
955 self.structured_data_size += content_size;
956 self.json_ld.push(json_content);
957 }
958
959 /// Set document head metadata from extracted head section.
960 ///
961 /// Merges metadata pairs from head elements (meta, title, link, etc.)
962 /// into the collector's head metadata store.
963 ///
964 /// # Arguments
965 ///
966 /// * `metadata` - BTreeMap of metadata key-value pairs
967 pub(crate) fn set_head_metadata(&mut self, metadata: BTreeMap<String, String>) {
968 if !self.config.extract_document {
969 return;
970 }
971 self.head_metadata.extend(metadata);
972 }
973
974 /// Set document language attribute.
975 ///
976 /// Usually from `lang` attribute on `<html>` or `<body>` tag.
977 /// Only sets if not already set (first occurrence wins).
978 ///
979 /// # Arguments
980 ///
981 /// * `lang` - Language code (e.g., "en", "es", "fr")
982 pub(crate) fn set_language(&mut self, lang: String) {
983 if !self.config.extract_document {
984 return;
985 }
986 if self.lang.is_none() {
987 self.lang = Some(lang);
988 }
989 }
990
991 /// Set document text direction attribute.
992 ///
993 /// Usually from `dir` attribute on `<html>` or `<body>` tag.
994 /// Only sets if not already set (first occurrence wins).
995 ///
996 /// # Arguments
997 ///
998 /// * `dir` - Direction string ("ltr", "rtl", or "auto")
999 pub(crate) fn set_text_direction(&mut self, dir: String) {
1000 if !self.config.extract_document {
1001 return;
1002 }
1003 if self.dir.is_none() {
1004 self.dir = Some(dir);
1005 }
1006 }
1007
1008 pub(crate) fn wants_document(&self) -> bool {
1009 self.config.extract_document
1010 }
1011
1012 pub(crate) fn wants_headers(&self) -> bool {
1013 self.config.extract_headers
1014 }
1015
1016 pub(crate) fn wants_links(&self) -> bool {
1017 self.config.extract_links
1018 }
1019
1020 pub(crate) fn wants_images(&self) -> bool {
1021 self.config.extract_images
1022 }
1023
1024 pub(crate) fn wants_structured_data(&self) -> bool {
1025 self.config.extract_structured_data
1026 }
1027
1028 /// Extract document metadata from collected head metadata.
1029 ///
1030 /// Parses head metadata into structured document metadata,
1031 /// handling special cases like Open Graph, Twitter Card, keywords, etc.
1032 #[allow(dead_code)]
1033 fn extract_document_metadata(
1034 head_metadata: BTreeMap<String, String>,
1035 lang: Option<String>,
1036 dir: Option<String>,
1037 ) -> DocumentMetadata {
1038 let mut doc = DocumentMetadata::default();
1039
1040 for (raw_key, value) in head_metadata {
1041 let mut key = raw_key.as_str();
1042 let mut replaced_key: Option<String> = None;
1043
1044 if let Some(stripped) = key.strip_prefix("meta-") {
1045 key = stripped;
1046 }
1047
1048 if key.as_bytes().contains(&b':') {
1049 replaced_key = Some(key.replace(':', "-"));
1050 key = replaced_key.as_deref().unwrap_or(key);
1051 }
1052
1053 match key {
1054 "title" => doc.title = Some(value),
1055 "description" => doc.description = Some(value),
1056 "author" => doc.author = Some(value),
1057 "canonical" => doc.canonical_url = Some(value),
1058 "base" | "base-href" => doc.base_href = Some(value),
1059 key if key.starts_with("og-") => {
1060 let og_key = if key.as_bytes().contains(&b'-') {
1061 key.trim_start_matches("og-").replace('-', "_")
1062 } else {
1063 key.trim_start_matches("og-").to_string()
1064 };
1065 doc.open_graph.insert(og_key, value);
1066 }
1067 key if key.starts_with("twitter-") => {
1068 let tw_key = if key.as_bytes().contains(&b'-') {
1069 key.trim_start_matches("twitter-").replace('-', "_")
1070 } else {
1071 key.trim_start_matches("twitter-").to_string()
1072 };
1073 doc.twitter_card.insert(tw_key, value);
1074 }
1075 "keywords" => {
1076 doc.keywords = value
1077 .split(',')
1078 .map(|s| s.trim().to_string())
1079 .filter(|s| !s.is_empty())
1080 .collect();
1081 }
1082 _ => {
1083 let meta_key = if key.as_ptr() == raw_key.as_ptr() && key.len() == raw_key.len() {
1084 raw_key
1085 } else if let Some(replaced) = replaced_key {
1086 replaced
1087 } else {
1088 key.to_string()
1089 };
1090 doc.meta_tags.insert(meta_key, value);
1091 }
1092 }
1093 }
1094
1095 if let Some(lang) = lang {
1096 doc.language = Some(lang);
1097 }
1098
1099 if let Some(dir) = dir {
1100 if let Some(parsed_dir) = TextDirection::parse(&dir) {
1101 doc.text_direction = Some(parsed_dir);
1102 }
1103 }
1104
1105 doc
1106 }
1107
1108 /// Extract structured data blocks into StructuredData items.
1109 #[allow(dead_code)]
1110 fn extract_structured_data(json_ld: Vec<String>) -> Vec<StructuredData> {
1111 let mut result = Vec::with_capacity(json_ld.len());
1112
1113 for json_str in json_ld {
1114 let schema_type = Self::scan_schema_type(&json_str)
1115 .or_else(|| {
1116 if json_str.contains("\"@type\"") {
1117 serde_json::from_str::<serde_json::Value>(&json_str)
1118 .ok()
1119 .and_then(|v| v.get("@type").and_then(|t| t.as_str().map(|s| s.to_string())))
1120 } else {
1121 None
1122 }
1123 })
1124 .or_else(|| {
1125 if !json_str.contains("\"@graph\"") {
1126 return None;
1127 }
1128
1129 let value = serde_json::from_str::<serde_json::Value>(&json_str).ok()?;
1130 let graph = value.get("@graph")?;
1131 let items = graph.as_array()?;
1132 items
1133 .iter()
1134 .find_map(|item| item.get("@type").and_then(|t| t.as_str().map(|s| s.to_string())))
1135 });
1136
1137 result.push(StructuredData {
1138 data_type: StructuredDataType::JsonLd,
1139 raw_json: json_str,
1140 schema_type,
1141 });
1142 }
1143
1144 result
1145 }
1146
1147 fn scan_schema_type(json_str: &str) -> Option<String> {
1148 let needle = "\"@type\"";
1149 let start = json_str.find(needle)? + needle.len();
1150 let bytes = json_str.as_bytes();
1151 let mut i = start;
1152
1153 while i < bytes.len() && bytes[i].is_ascii_whitespace() {
1154 i += 1;
1155 }
1156 if i >= bytes.len() || bytes[i] != b':' {
1157 return None;
1158 }
1159 i += 1;
1160 while i < bytes.len() && bytes[i].is_ascii_whitespace() {
1161 i += 1;
1162 }
1163 if i >= bytes.len() {
1164 return None;
1165 }
1166
1167 if bytes[i] == b'[' {
1168 i += 1;
1169 while i < bytes.len() && bytes[i].is_ascii_whitespace() {
1170 i += 1;
1171 }
1172 if i >= bytes.len() || bytes[i] != b'"' {
1173 return None;
1174 }
1175 } else if bytes[i] != b'"' {
1176 return None;
1177 }
1178
1179 let start_quote = i;
1180 i += 1;
1181 let mut escaped = false;
1182 while i < bytes.len() {
1183 let byte = bytes[i];
1184 if escaped {
1185 escaped = false;
1186 i += 1;
1187 continue;
1188 }
1189 if byte == b'\\' {
1190 escaped = true;
1191 i += 1;
1192 continue;
1193 }
1194 if byte == b'"' {
1195 let end_quote = i;
1196 let slice = &json_str[start_quote..=end_quote];
1197 return serde_json::from_str::<String>(slice).ok();
1198 }
1199 i += 1;
1200 }
1201
1202 None
1203 }
1204
1205 /// Finish collection and return all extracted metadata.
1206 ///
1207 /// Performs final processing, validation, and consolidation of all
1208 /// collected data into the [`ExtendedMetadata`] output structure.
1209 ///
1210 /// # Returns
1211 ///
1212 /// Complete [`ExtendedMetadata`] with all extracted information.
1213 #[allow(dead_code)]
1214 pub(crate) fn finish(self) -> ExtendedMetadata {
1215 let structured_data = Self::extract_structured_data(self.json_ld);
1216 let document = Self::extract_document_metadata(self.head_metadata, self.lang, self.dir);
1217
1218 ExtendedMetadata {
1219 document,
1220 headers: self.headers,
1221 links: self.links,
1222 images: self.images,
1223 structured_data,
1224 }
1225 }
1226
1227 /// Categorize links by type for analysis and filtering.
1228 ///
1229 /// Separates collected links into groups by [`LinkType`].
1230 /// This is an analysis helper method; actual categorization happens during add_link.
1231 ///
1232 /// # Returns
1233 ///
1234 /// BTreeMap with LinkType as key and Vec of matching LinkMetadata as value.
1235 #[allow(dead_code)]
1236 pub(crate) fn categorize_links(&self) -> BTreeMap<String, Vec<&LinkMetadata>> {
1237 let mut categorized: BTreeMap<String, Vec<&LinkMetadata>> = BTreeMap::new();
1238
1239 for link in &self.links {
1240 let category = link.link_type.to_string();
1241 categorized.entry(category).or_default().push(link);
1242 }
1243
1244 categorized
1245 }
1246
1247 /// Count headers by level for structural analysis.
1248 ///
1249 /// Returns count of headers at each level (1-6).
1250 ///
1251 /// # Returns
1252 ///
1253 /// BTreeMap with level as string key and count as value.
1254 #[allow(dead_code)]
1255 pub(crate) fn header_counts(&self) -> BTreeMap<String, usize> {
1256 let mut counts: BTreeMap<String, usize> = BTreeMap::new();
1257
1258 for header in &self.headers {
1259 *counts.entry(header.level.to_string()).or_insert(0) += 1;
1260 }
1261
1262 counts
1263 }
1264}
1265
1266/// Handle to a metadata collector via reference-counted mutable cell.
1267///
1268/// Used internally for sharing collector state across the tree traversal.
1269/// Matches the pattern used for [`InlineImageCollector`](crate::inline_images::InlineImageCollector).
1270///
1271/// # Examples
1272///
1273/// ```ignore
1274/// let collector = MetadataCollector::new(MetadataConfig::default());
1275/// let handle = Rc::new(RefCell::new(collector));
1276///
1277/// // In tree walk, can be passed and borrowed
1278/// handle.borrow_mut().add_header(1, "Title".to_string(), None, 0, 100);
1279///
1280/// let metadata = handle.take().finish();
1281/// ```
1282#[allow(dead_code)]
1283pub(crate) type MetadataCollectorHandle = Rc<RefCell<MetadataCollector>>;
1284
1285#[cfg(test)]
1286mod tests {
1287 use super::*;
1288
1289 #[test]
1290 fn test_text_direction_parse() {
1291 assert_eq!(TextDirection::parse("ltr"), Some(TextDirection::LeftToRight));
1292 assert_eq!(TextDirection::parse("rtl"), Some(TextDirection::RightToLeft));
1293 assert_eq!(TextDirection::parse("auto"), Some(TextDirection::Auto));
1294 assert_eq!(TextDirection::parse("invalid"), None);
1295 assert_eq!(TextDirection::parse("LTR"), Some(TextDirection::LeftToRight));
1296 }
1297
1298 #[test]
1299 fn test_text_direction_display() {
1300 assert_eq!(TextDirection::LeftToRight.to_string(), "ltr");
1301 assert_eq!(TextDirection::RightToLeft.to_string(), "rtl");
1302 assert_eq!(TextDirection::Auto.to_string(), "auto");
1303 }
1304
1305 #[test]
1306 fn test_link_classification() {
1307 assert_eq!(LinkMetadata::classify_link("#section"), LinkType::Anchor);
1308 assert_eq!(LinkMetadata::classify_link("mailto:test@example.com"), LinkType::Email);
1309 assert_eq!(LinkMetadata::classify_link("tel:+1234567890"), LinkType::Phone);
1310 assert_eq!(LinkMetadata::classify_link("https://example.com"), LinkType::External);
1311 assert_eq!(LinkMetadata::classify_link("http://example.com"), LinkType::External);
1312 assert_eq!(LinkMetadata::classify_link("/path/to/page"), LinkType::Internal);
1313 assert_eq!(LinkMetadata::classify_link("../relative"), LinkType::Internal);
1314 assert_eq!(LinkMetadata::classify_link("./same"), LinkType::Internal);
1315 }
1316
1317 #[test]
1318 fn test_header_validation() {
1319 let valid = HeaderMetadata {
1320 level: 3,
1321 text: "Title".to_string(),
1322 id: None,
1323 depth: 2,
1324 html_offset: 100,
1325 };
1326 assert!(valid.is_valid());
1327
1328 let invalid_high = HeaderMetadata {
1329 level: 7,
1330 text: "Title".to_string(),
1331 id: None,
1332 depth: 2,
1333 html_offset: 100,
1334 };
1335 assert!(!invalid_high.is_valid());
1336
1337 let invalid_low = HeaderMetadata {
1338 level: 0,
1339 text: "Title".to_string(),
1340 id: None,
1341 depth: 2,
1342 html_offset: 100,
1343 };
1344 assert!(!invalid_low.is_valid());
1345 }
1346
1347 #[test]
1348 fn test_metadata_collector_new() {
1349 let config = MetadataConfig::default();
1350 let collector = MetadataCollector::new(config);
1351
1352 assert_eq!(collector.headers.capacity(), 32);
1353 assert_eq!(collector.links.capacity(), 64);
1354 assert_eq!(collector.images.capacity(), 16);
1355 assert_eq!(collector.json_ld.capacity(), 4);
1356 }
1357
1358 #[test]
1359 fn test_metadata_collector_add_header() {
1360 let config = MetadataConfig::default();
1361 let mut collector = MetadataCollector::new(config);
1362
1363 collector.add_header(1, "Title".to_string(), Some("title".to_string()), 0, 100);
1364 assert_eq!(collector.headers.len(), 1);
1365
1366 let header = &collector.headers[0];
1367 assert_eq!(header.level, 1);
1368 assert_eq!(header.text, "Title");
1369 assert_eq!(header.id, Some("title".to_string()));
1370
1371 collector.add_header(7, "Invalid".to_string(), None, 0, 200);
1372 assert_eq!(collector.headers.len(), 1);
1373 }
1374
1375 #[test]
1376 fn test_metadata_collector_add_link() {
1377 let config = MetadataConfig::default();
1378 let mut collector = MetadataCollector::new(config);
1379
1380 collector.add_link(
1381 "https://example.com".to_string(),
1382 "Example".to_string(),
1383 Some("Visit".to_string()),
1384 Some("nofollow external".to_string()),
1385 BTreeMap::from([("data-id".to_string(), "example".to_string())]),
1386 );
1387
1388 assert_eq!(collector.links.len(), 1);
1389
1390 let link = &collector.links[0];
1391 assert_eq!(link.href, "https://example.com");
1392 assert_eq!(link.text, "Example");
1393 assert_eq!(link.link_type, LinkType::External);
1394 assert_eq!(link.rel, vec!["nofollow", "external"]);
1395 assert_eq!(link.attributes.get("data-id"), Some(&"example".to_string()));
1396 }
1397
1398 #[test]
1399 fn test_metadata_collector_respects_config() {
1400 let config = MetadataConfig {
1401 extract_document: false,
1402 extract_headers: false,
1403 extract_links: false,
1404 extract_images: false,
1405 extract_structured_data: false,
1406 max_structured_data_size: DEFAULT_MAX_STRUCTURED_DATA_SIZE,
1407 };
1408 let mut collector = MetadataCollector::new(config);
1409
1410 collector.add_header(1, "Title".to_string(), None, 0, 100);
1411 collector.add_link(
1412 "https://example.com".to_string(),
1413 "Link".to_string(),
1414 None,
1415 None,
1416 BTreeMap::new(),
1417 );
1418 collector.add_image(
1419 "https://example.com/img.jpg".to_string(),
1420 None,
1421 None,
1422 None,
1423 BTreeMap::new(),
1424 );
1425 collector.add_json_ld("{}".to_string());
1426
1427 assert!(collector.headers.is_empty());
1428 assert!(collector.links.is_empty());
1429 assert!(collector.images.is_empty());
1430 assert!(collector.json_ld.is_empty());
1431 }
1432
1433 #[test]
1434 fn test_metadata_collector_finish() {
1435 let config = MetadataConfig::default();
1436 let mut collector = MetadataCollector::new(config);
1437
1438 collector.set_language("en".to_string());
1439 collector.add_header(1, "Main Title".to_string(), None, 0, 100);
1440 collector.add_link(
1441 "https://example.com".to_string(),
1442 "Example".to_string(),
1443 None,
1444 None,
1445 BTreeMap::new(),
1446 );
1447
1448 let metadata = collector.finish();
1449
1450 assert_eq!(metadata.document.language, Some("en".to_string()));
1451 assert_eq!(metadata.headers.len(), 1);
1452 assert_eq!(metadata.links.len(), 1);
1453 }
1454
1455 #[test]
1456 fn test_document_metadata_default() {
1457 let doc = DocumentMetadata::default();
1458
1459 assert!(doc.title.is_none());
1460 assert!(doc.description.is_none());
1461 assert!(doc.keywords.is_empty());
1462 assert!(doc.open_graph.is_empty());
1463 assert!(doc.twitter_card.is_empty());
1464 assert!(doc.meta_tags.is_empty());
1465 }
1466
1467 #[test]
1468 fn test_metadata_config_default() {
1469 let config = MetadataConfig::default();
1470
1471 assert!(config.extract_headers);
1472 assert!(config.extract_links);
1473 assert!(config.extract_images);
1474 assert!(config.extract_structured_data);
1475 assert_eq!(config.max_structured_data_size, DEFAULT_MAX_STRUCTURED_DATA_SIZE);
1476 }
1477
1478 #[test]
1479 fn test_image_type_classification() {
1480 let data_uri = ImageMetadata {
1481 src: "data:image/png;base64,iVBORw0KG...".to_string(),
1482 alt: None,
1483 title: None,
1484 dimensions: None,
1485 image_type: ImageType::DataUri,
1486 attributes: BTreeMap::new(),
1487 };
1488 assert_eq!(data_uri.image_type, ImageType::DataUri);
1489
1490 let external = ImageMetadata {
1491 src: "https://example.com/image.jpg".to_string(),
1492 alt: None,
1493 title: None,
1494 dimensions: None,
1495 image_type: ImageType::External,
1496 attributes: BTreeMap::new(),
1497 };
1498 assert_eq!(external.image_type, ImageType::External);
1499 }
1500
1501 #[test]
1502 fn test_link_type_display() {
1503 assert_eq!(LinkType::Anchor.to_string(), "anchor");
1504 assert_eq!(LinkType::Internal.to_string(), "internal");
1505 assert_eq!(LinkType::External.to_string(), "external");
1506 assert_eq!(LinkType::Email.to_string(), "email");
1507 assert_eq!(LinkType::Phone.to_string(), "phone");
1508 assert_eq!(LinkType::Other.to_string(), "other");
1509 }
1510
1511 #[test]
1512 fn test_structured_data_type_display() {
1513 assert_eq!(StructuredDataType::JsonLd.to_string(), "json_ld");
1514 assert_eq!(StructuredDataType::Microdata.to_string(), "microdata");
1515 assert_eq!(StructuredDataType::RDFa.to_string(), "rdfa");
1516 }
1517
1518 #[test]
1519 fn test_categorize_links() {
1520 let config = MetadataConfig::default();
1521 let mut collector = MetadataCollector::new(config);
1522
1523 collector.add_link("#anchor".to_string(), "Anchor".to_string(), None, None, BTreeMap::new());
1524 collector.add_link(
1525 "https://example.com".to_string(),
1526 "External".to_string(),
1527 None,
1528 None,
1529 BTreeMap::new(),
1530 );
1531 collector.add_link(
1532 "mailto:test@example.com".to_string(),
1533 "Email".to_string(),
1534 None,
1535 None,
1536 BTreeMap::new(),
1537 );
1538
1539 let categorized = collector.categorize_links();
1540
1541 assert_eq!(categorized.get("anchor").map(|v| v.len()), Some(1));
1542 assert_eq!(categorized.get("external").map(|v| v.len()), Some(1));
1543 assert_eq!(categorized.get("email").map(|v| v.len()), Some(1));
1544 }
1545
1546 #[test]
1547 fn test_header_counts() {
1548 let config = MetadataConfig::default();
1549 let mut collector = MetadataCollector::new(config);
1550
1551 collector.add_header(1, "H1".to_string(), None, 0, 100);
1552 collector.add_header(2, "H2".to_string(), None, 1, 200);
1553 collector.add_header(2, "H2b".to_string(), None, 1, 300);
1554 collector.add_header(3, "H3".to_string(), None, 2, 400);
1555
1556 let counts = collector.header_counts();
1557
1558 assert_eq!(counts.get("1").copied(), Some(1));
1559 assert_eq!(counts.get("2").copied(), Some(2));
1560 assert_eq!(counts.get("3").copied(), Some(1));
1561 }
1562}