html_to_markdown_rs/metadata.rs
1#![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
2//! Metadata extraction for HTML to Markdown conversion.
3//!
4//! This module provides comprehensive, type-safe metadata extraction during HTML-to-Markdown
5//! conversion, enabling content analysis, SEO optimization, and document indexing workflows.
6//! Metadata includes:
7//! - **Document metadata**: Title, description, author, language, canonical URL, Open Graph, Twitter Card
8//! - **Headers**: Heading elements (h1-h6) with hierarchy, IDs, and positions
9//! - **Links**: Hyperlinks with type classification (anchor, internal, external, email, phone)
10//! - **Images**: Image elements with source, alt text, dimensions, and type (data URI, external, etc.)
11//! - **Structured data**: JSON-LD, Microdata, and `RDFa` blocks
12//!
//! The implementation follows a single-pass collector pattern and adds no overhead
//! when metadata features are disabled.
15//!
16//! # Architecture
17//!
18//! Metadata extraction uses the [`MetadataCollector`] pattern (similar to [`InlineImageCollector`]):
19//! - **Single-pass collection**: Metadata is gathered during the primary tree traversal without additional passes
20//! - **Zero overhead when disabled**: Entire module can be compiled out via feature flags
21//! - **Configurable granularity**: Use [`MetadataConfig`] to select which metadata types to extract
22//! - **Type-safe APIs**: All metadata types are enum-based with exhaustive matching
23//! - **Memory-bounded**: Size limits prevent memory exhaustion from adversarial documents
24//! - **Pre-allocated buffers**: Typical documents (32 headers, 64 links, 16 images) handled efficiently
25//!
26//! # Type Overview
27//!
28//! ## Enumerations
29//!
30//! - [`TextDirection`]: Document directionality (LTR, RTL, Auto)
31//! - [`LinkType`]: Link classification (Anchor, Internal, External, Email, Phone, Other)
32//! - [`ImageType`]: Image source type (`DataUri`, External, Relative, `InlineSvg`)
33//! - [`StructuredDataType`]: Structured data format (`JsonLd`, Microdata, `RDFa`)
34//!
35//! ## Structures
36//!
37//! - [`DocumentMetadata`]: Head-level metadata with maps for Open Graph and Twitter Card
38//! - [`HeaderMetadata`]: Heading element with level (1-6), text, ID, hierarchy depth, and position
39//! - [`LinkMetadata`]: Hyperlink with href, text, title, type, rel attributes, and custom attributes
40//! - [`ImageMetadata`]: Image element with src, alt, title, dimensions, type, and attributes
41//! - [`StructuredData`]: Structured data block with type and raw JSON
42//! - [`MetadataConfig`]: Configuration controlling extraction granularity and size limits
43//! - [`ExtendedMetadata`]: Top-level result containing all extracted metadata
44//!
45//! # Examples
46//!
47//! ## Basic Usage with `convert_with_metadata`
48//!
49//! ```ignore
//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
//! use html_to_markdown_rs::metadata::{ImageType, LinkType};
51//!
52//! let html = r#"
53//! <html lang="en">
54//! <head>
55//! <title>My Article</title>
56//! <meta name="description" content="An interesting read">
57//! </head>
58//! <body>
59//! <h1 id="main">Title</h1>
60//! <a href="https://example.com">External Link</a>
61//! <img src="photo.jpg" alt="A photo">
62//! </body>
63//! </html>
64//! "#;
65//!
66//! let config = MetadataConfig::default();
67//! let (markdown, metadata) = convert_with_metadata(html, None, config)?;
68//!
69//! // Access document metadata
70//! assert_eq!(metadata.document.title, Some("My Article".to_string()));
71//! assert_eq!(metadata.document.language, Some("en".to_string()));
72//!
73//! // Access headers
74//! assert_eq!(metadata.headers.len(), 1);
75//! assert_eq!(metadata.headers[0].level, 1);
76//! assert_eq!(metadata.headers[0].id, Some("main".to_string()));
77//!
78//! // Access links
79//! assert_eq!(metadata.links.len(), 1);
80//! assert_eq!(metadata.links[0].link_type, LinkType::External);
81//!
82//! // Access images
83//! assert_eq!(metadata.images.len(), 1);
84//! assert_eq!(metadata.images[0].image_type, ImageType::Relative);
85//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
86//! ```
87//!
88//! ## Selective Extraction
89//!
90//! ```ignore
91//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
92//!
//! let config = MetadataConfig {
//!     extract_document: true,
//!     extract_headers: true,
//!     extract_links: true,
//!     extract_images: false,           // Skip images
//!     extract_structured_data: false,  // Skip structured data
//!     max_structured_data_size: 0,
//! };
100//!
101//! let (markdown, metadata) = convert_with_metadata(html, None, config)?;
102//! assert_eq!(metadata.images.len(), 0); // Images not extracted
103//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
104//! ```
105//!
106//! ## Analyzing Link Types
107//!
108//! ```ignore
109//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
110//! use html_to_markdown_rs::metadata::LinkType;
111//!
112//! let (_markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default(), None)?;
113//!
114//! for link in &metadata.links {
115//! match link.link_type {
116//! LinkType::External => println!("External: {}", link.href),
117//! LinkType::Internal => println!("Internal: {}", link.href),
118//! LinkType::Anchor => println!("Anchor: {}", link.href),
119//! LinkType::Email => println!("Email: {}", link.href),
120//! _ => {}
121//! }
122//! }
123//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
124//! ```
125//!
126//! # Serialization
127//!
//! The metadata types in this module derive `serde::Serialize` and `serde::Deserialize` when the
//! `metadata` feature is enabled ([`MetadataConfigUpdate`] is deserialize-only).
//! This enables easy export to JSON, YAML, or other formats:
130//!
131//! ```ignore
132//! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
133//!
134//! let (_markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default(), None)?;
135//! let json = serde_json::to_string_pretty(&metadata)?;
136//! println!("{}", json);
137//! # Ok::<(), Box<dyn std::error::Error>>(())
138//! ```
139
140use std::cell::RefCell;
141use std::collections::BTreeMap;
142use std::rc::Rc;
143
/// Directionality of a document's text.
///
/// Mirrors the HTML `dir` attribute and `bdi` element directionality. Both the
/// `Display` output and the serde names (with the `metadata` feature) are the
/// lowercase keywords `ltr`, `rtl`, and `auto`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub enum TextDirection {
    /// Text flows left to right (the default for Latin scripts).
    #[cfg_attr(feature = "metadata", serde(rename = "ltr"))]
    LeftToRight,
    /// Text flows right to left (Hebrew, Arabic, Urdu, etc.).
    #[cfg_attr(feature = "metadata", serde(rename = "rtl"))]
    RightToLeft,
    /// Direction is detected automatically.
    #[cfg_attr(feature = "metadata", serde(rename = "auto"))]
    Auto,
}

impl std::fmt::Display for TextDirection {
    /// Writes the lowercase keyword for this direction (`ltr`, `rtl`, or `auto`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let keyword = match self {
            Self::LeftToRight => "ltr",
            Self::RightToLeft => "rtl",
            Self::Auto => "auto",
        };
        f.write_str(keyword)
    }
}

impl TextDirection {
    /// Parse a direction keyword, ignoring ASCII case.
    ///
    /// # Arguments
    ///
    /// * `s` - Direction string ("ltr", "rtl", or "auto")
    ///
    /// # Returns
    ///
    /// `Some(TextDirection)` for a recognized keyword, `None` for anything else.
    /// The input is compared as-is; it is not trimmed first.
    ///
    /// # Examples
    ///
    /// ```
    /// # use html_to_markdown_rs::metadata::TextDirection;
    /// assert_eq!(TextDirection::parse("ltr"), Some(TextDirection::LeftToRight));
    /// assert_eq!(TextDirection::parse("rtl"), Some(TextDirection::RightToLeft));
    /// assert_eq!(TextDirection::parse("auto"), Some(TextDirection::Auto));
    /// assert_eq!(TextDirection::parse("invalid"), None);
    /// ```
    #[must_use]
    pub fn parse(s: &str) -> Option<Self> {
        // Keyword table, checked in the same order as the variants are declared.
        const KEYWORDS: [(&str, TextDirection); 3] = [
            ("ltr", TextDirection::LeftToRight),
            ("rtl", TextDirection::RightToLeft),
            ("auto", TextDirection::Auto),
        ];

        KEYWORDS
            .iter()
            .find(|(keyword, _)| s.eq_ignore_ascii_case(keyword))
            .map(|&(_, direction)| direction)
    }
}
205
/// Category assigned to a link from its href value and document context.
///
/// Lets consumers filter and analyze extracted links by kind. The `Display`
/// output is the lowercase variant name (`anchor`, `internal`, ...).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "metadata", serde(rename_all = "snake_case"))]
pub enum LinkType {
    /// Fragment link within the same document (href starts with `#`)
    Anchor,
    /// Link that stays within the same domain
    Internal,
    /// Link to a different domain
    External,
    /// Email link (`mailto:`)
    Email,
    /// Phone link (`tel:`)
    Phone,
    /// Any other protocol, or a link that cannot be classified
    Other,
}

impl std::fmt::Display for LinkType {
    /// Writes the lowercase name of the link category.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Anchor => "anchor",
            Self::Internal => "internal",
            Self::External => "external",
            Self::Email => "email",
            Self::Phone => "phone",
            Self::Other => "other",
        };
        f.write_str(label)
    }
}
239
/// Category of an image source, used to decide how the image is handled.
///
/// Distinguishes embedded images (data URIs), inline SVG, external URLs, and
/// relative paths. The `Display` output is the snake_case variant name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "metadata", serde(rename_all = "snake_case"))]
pub enum ImageType {
    /// Image embedded as a data URI (base64 or other encoding)
    DataUri,
    /// Inline `<svg>` element
    InlineSvg,
    /// Image at an external URL (http/https)
    External,
    /// Image referenced by a relative path
    Relative,
}

impl std::fmt::Display for ImageType {
    /// Writes the snake_case name of the image category.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::DataUri => "data_uri",
            Self::InlineSvg => "inline_svg",
            Self::External => "external",
            Self::Relative => "relative",
        };
        f.write_str(label)
    }
}
267
/// Format of a structured data block.
///
/// Names the schema/markup convention the block was written in. The `Display`
/// output matches the serde names: `json_ld`, `microdata`, `rdfa`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "metadata", serde(rename_all = "snake_case"))]
pub enum StructuredDataType {
    /// JSON-LD (JSON for Linking Data) script blocks
    #[cfg_attr(feature = "metadata", serde(rename = "json_ld"))]
    JsonLd,
    /// HTML5 Microdata attributes (itemscope, itemtype, itemprop)
    Microdata,
    /// RDF in Attributes (`RDFa`) markup
    #[cfg_attr(feature = "metadata", serde(rename = "rdfa"))]
    RDFa,
}

impl std::fmt::Display for StructuredDataType {
    /// Writes the snake_case name of the structured data format.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::JsonLd => "json_ld",
            Self::Microdata => "microdata",
            Self::RDFa => "rdfa",
        };
        f.write_str(label)
    }
}
294
/// Document-level metadata extracted from `<head>` and top-level elements.
///
/// Contains all metadata typically used by search engines, social media platforms,
/// and browsers for document indexing and presentation.
///
/// The derived `Default` leaves every `Option` field as `None` and every collection
/// empty, so a value can be built with struct-update syntax as in the example below.
/// The map fields are `BTreeMap`s, so iterating them visits keys in sorted order.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::DocumentMetadata;
/// let doc = DocumentMetadata {
///     title: Some("My Article".to_string()),
///     description: Some("A great article about Rust".to_string()),
///     keywords: vec!["rust".to_string(), "programming".to_string()],
///     ..Default::default()
/// };
///
/// assert_eq!(doc.title, Some("My Article".to_string()));
/// ```
#[derive(Debug, Clone, Default)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct DocumentMetadata {
    /// Document title from `<title>` tag
    pub title: Option<String>,

    /// Document description from `<meta name="description">` tag
    pub description: Option<String>,

    /// Document keywords from `<meta name="keywords">` tag, split on commas
    pub keywords: Vec<String>,

    /// Document author from `<meta name="author">` tag
    pub author: Option<String>,

    /// Canonical URL from `<link rel="canonical">` tag
    pub canonical_url: Option<String>,

    /// Base URL from `<base href="">` tag for resolving relative URLs
    pub base_href: Option<String>,

    /// Document language from `lang` attribute
    pub language: Option<String>,

    /// Document text direction from `dir` attribute
    pub text_direction: Option<TextDirection>,

    /// Open Graph metadata (og:* properties) for social media.
    ///
    /// Keys like "title", "description", "image", "url", etc.
    pub open_graph: BTreeMap<String, String>,

    /// Twitter Card metadata (twitter:* properties).
    ///
    /// Keys like "card", "site", "creator", "title", "description", "image", etc.
    pub twitter_card: BTreeMap<String, String>,

    /// Additional meta tags not covered by specific fields.
    ///
    /// Keys are meta name/property attributes, values are content.
    pub meta_tags: BTreeMap<String, String>,
}
352
/// Metadata for one heading element, including where it sits in the document.
///
/// Records a heading (h1-h6) together with its text, optional `id`,
/// tree depth, and byte position in the source HTML.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::HeaderMetadata;
/// let header = HeaderMetadata {
///     level: 1,
///     text: "Main Title".to_string(),
///     id: Some("main-title".to_string()),
///     depth: 0,
///     html_offset: 145,
/// };
///
/// assert_eq!(header.level, 1);
/// assert!(header.is_valid());
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct HeaderMetadata {
    /// Heading level, from 1 (h1) to 6 (h6)
    pub level: u8,

    /// Normalized text content of the heading
    pub text: String,

    /// Value of the HTML `id` attribute, when the element has one
    pub id: Option<String>,

    /// Depth of the heading element in the document tree
    pub depth: usize,

    /// Byte offset of the heading in the original HTML document
    pub html_offset: usize,
}

impl HeaderMetadata {
    /// Check that `level` names a real HTML heading (h1 through h6).
    ///
    /// # Returns
    ///
    /// `true` when `level` is in `1..=6`, `false` otherwise (including `0`).
    ///
    /// # Examples
    ///
    /// ```
    /// # use html_to_markdown_rs::metadata::HeaderMetadata;
    /// let valid = HeaderMetadata {
    ///     level: 3,
    ///     text: "Title".to_string(),
    ///     id: None,
    ///     depth: 2,
    ///     html_offset: 100,
    /// };
    /// assert!(valid.is_valid());
    ///
    /// let invalid = HeaderMetadata {
    ///     level: 7, // Invalid
    ///     text: "Title".to_string(),
    ///     id: None,
    ///     depth: 2,
    ///     html_offset: 100,
    /// };
    /// assert!(!invalid.is_valid());
    /// ```
    #[must_use]
    pub const fn is_valid(&self) -> bool {
        matches!(self.level, 1..=6)
    }
}
426
427/// Hyperlink metadata with categorization and attributes.
428///
429/// Represents `<a>` elements with parsed href values, text content, and link type classification.
430///
431/// # Examples
432///
433/// ```
434/// # use html_to_markdown_rs::metadata::{LinkMetadata, LinkType};
435/// let link = LinkMetadata {
436/// href: "https://example.com".to_string(),
437/// text: "Example".to_string(),
438/// title: Some("Visit Example".to_string()),
439/// link_type: LinkType::External,
440/// rel: vec!["nofollow".to_string()],
441/// attributes: Default::default(),
442/// };
443///
444/// assert_eq!(link.link_type, LinkType::External);
445/// assert_eq!(link.text, "Example");
446/// ```
447#[derive(Debug, Clone)]
448#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
449pub struct LinkMetadata {
450 /// The href URL value
451 pub href: String,
452
453 /// Link text content (normalized, concatenated if mixed with elements)
454 pub text: String,
455
456 /// Optional title attribute (often shown as tooltip)
457 pub title: Option<String>,
458
459 /// Link type classification
460 pub link_type: LinkType,
461
462 /// Rel attribute values (e.g., "nofollow", "stylesheet", "canonical")
463 pub rel: Vec<String>,
464
465 /// Additional HTML attributes
466 pub attributes: BTreeMap<String, String>,
467}
468
469impl LinkMetadata {
470 /// Classify a link based on href value.
471 ///
472 /// # Arguments
473 ///
474 /// * `href` - The href attribute value
475 ///
476 /// # Returns
477 ///
478 /// Appropriate [`LinkType`] based on protocol and content.
479 ///
480 /// # Examples
481 ///
482 /// ```
483 /// # use html_to_markdown_rs::metadata::{LinkMetadata, LinkType};
484 /// assert_eq!(LinkMetadata::classify_link("#section"), LinkType::Anchor);
485 /// assert_eq!(LinkMetadata::classify_link("mailto:test@example.com"), LinkType::Email);
486 /// assert_eq!(LinkMetadata::classify_link("tel:+1234567890"), LinkType::Phone);
487 /// assert_eq!(LinkMetadata::classify_link("https://example.com"), LinkType::External);
488 /// ```
489 #[must_use]
490 pub fn classify_link(href: &str) -> LinkType {
491 if href.starts_with('#') {
492 LinkType::Anchor
493 } else if href.starts_with("mailto:") {
494 LinkType::Email
495 } else if href.starts_with("tel:") {
496 LinkType::Phone
497 } else if href.starts_with("http://") || href.starts_with("https://") {
498 LinkType::External
499 } else if href.starts_with('/') || href.starts_with("../") || href.starts_with("./") {
500 LinkType::Internal
501 } else {
502 LinkType::Other
503 }
504 }
505}
506
/// Image metadata with source and dimensions.
///
/// Captures `<img>` elements and inline `<svg>` elements with metadata
/// for image analysis and optimization.
///
/// Derives `serde::Serialize` / `serde::Deserialize` when the `metadata`
/// feature is enabled.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::{ImageMetadata, ImageType};
/// let img = ImageMetadata {
///     src: "https://example.com/image.jpg".to_string(),
///     alt: Some("An example image".to_string()),
///     title: Some("Example".to_string()),
///     dimensions: Some((800, 600)),
///     image_type: ImageType::External,
///     attributes: Default::default(),
/// };
///
/// assert_eq!(img.image_type, ImageType::External);
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct ImageMetadata {
    /// Image source (URL, data URI, or SVG content identifier)
    pub src: String,

    /// Alternative text from alt attribute (for accessibility)
    pub alt: Option<String>,

    /// Title attribute (often shown as tooltip)
    pub title: Option<String>,

    /// Image dimensions as `(width, height)` if available, `None` otherwise
    pub dimensions: Option<(u32, u32)>,

    /// Image type classification
    pub image_type: ImageType,

    /// Additional HTML attributes, keyed by attribute name
    pub attributes: BTreeMap<String, String>,
}
548
/// Structured data block (JSON-LD, Microdata, or `RDFa`).
///
/// Represents machine-readable structured data found in the document.
/// JSON-LD blocks are collected as raw JSON strings for flexibility.
///
/// Derives `serde::Serialize` / `serde::Deserialize` when the `metadata`
/// feature is enabled.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::{StructuredData, StructuredDataType};
/// let schema = StructuredData {
///     data_type: StructuredDataType::JsonLd,
///     raw_json: r#"{"@context":"https://schema.org","@type":"Article"}"#.to_string(),
///     schema_type: Some("Article".to_string()),
/// };
///
/// assert_eq!(schema.data_type, StructuredDataType::JsonLd);
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct StructuredData {
    /// Type of structured data (JSON-LD, Microdata, `RDFa`)
    pub data_type: StructuredDataType,

    /// Raw JSON string (for JSON-LD) or serialized representation
    pub raw_json: String,

    /// Schema type if detectable (e.g., "Article", "Event", "Product"), `None` otherwise
    pub schema_type: Option<String>,
}
578
/// Default maximum size for structured data extraction, in bytes.
///
/// The value is 1,000,000 bytes (1 MB, decimal). It is the value that
/// `MetadataConfig::default()` assigns to `max_structured_data_size`.
pub const DEFAULT_MAX_STRUCTURED_DATA_SIZE: usize = 1_000_000;
581
/// Configuration for metadata extraction granularity.
///
/// Controls which metadata types are extracted and size limits for safety.
/// Enables selective extraction of different metadata categories from HTML documents,
/// allowing fine-grained control over which types of information to collect during
/// the HTML-to-Markdown conversion process.
///
/// `MetadataConfig::default()` enables every extraction category and sets the
/// structured data limit to [`DEFAULT_MAX_STRUCTURED_DATA_SIZE`].
///
/// # Fields
///
/// - `extract_document`: Enable document-level metadata extraction (title, description, author, Open Graph, Twitter Card, etc.)
/// - `extract_headers`: Enable heading element extraction (h1-h6) with hierarchy tracking
/// - `extract_links`: Enable anchor element extraction with link type classification
/// - `extract_images`: Enable image element extraction with source and dimension metadata
/// - `extract_structured_data`: Enable structured data extraction (JSON-LD, Microdata, `RDFa`)
/// - `max_structured_data_size`: Safety limit on total structured data size in bytes
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::MetadataConfig;
/// let config = MetadataConfig {
///     extract_document: true,
///     extract_headers: true,
///     extract_links: true,
///     extract_images: true,
///     extract_structured_data: true,
///     max_structured_data_size: 1_000_000,
/// };
///
/// assert!(config.extract_headers);
/// ```
#[derive(Debug, Clone)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct MetadataConfig {
    /// Extract document-level metadata (title, description, author, etc.).
    ///
    /// When enabled, collects metadata from `<head>` section including:
    /// - `<title>` element content
    /// - `<meta name="description">` and other standard meta tags
    /// - Open Graph (og:*) properties for social media optimization
    /// - Twitter Card (twitter:*) properties
    /// - Language and text direction attributes
    /// - Canonical URL and base href references
    pub extract_document: bool,

    /// Extract h1-h6 header elements and their hierarchy.
    ///
    /// When enabled, collects all heading elements with:
    /// - Header level (1-6)
    /// - Text content (normalized)
    /// - HTML id attribute if present
    /// - Document tree depth for hierarchy tracking
    /// - Byte offset in original HTML for positioning
    pub extract_headers: bool,

    /// Extract anchor (a) elements as links with type classification.
    ///
    /// When enabled, collects all hyperlinks with:
    /// - href attribute value
    /// - Link text content
    /// - Title attribute (tooltip text)
    /// - Automatic link type classification (anchor, internal, external, email, phone, other)
    /// - Rel attribute values
    /// - Additional custom attributes
    pub extract_links: bool,

    /// Extract image elements and data URIs.
    ///
    /// When enabled, collects all image elements with:
    /// - Source URL or data URI
    /// - Alt text for accessibility
    /// - Title attribute
    /// - Dimensions (width, height) if available
    /// - Automatic image type classification (data URI, external, relative, inline SVG)
    /// - Additional custom attributes
    pub extract_images: bool,

    /// Extract structured data (JSON-LD, Microdata, `RDFa`).
    ///
    /// When enabled, collects machine-readable structured data including:
    /// - JSON-LD script blocks with schema detection
    /// - Microdata attributes (itemscope, itemtype, itemprop)
    /// - `RDFa` markup
    /// - Extracted schema type if detectable
    pub extract_structured_data: bool,

    /// Maximum total size of structured data to collect (bytes).
    ///
    /// Prevents memory exhaustion attacks on malformed or adversarial documents
    /// containing excessively large structured data blocks. When the accumulated
    /// size of structured data exceeds this limit, further collection stops.
    /// Default: [`DEFAULT_MAX_STRUCTURED_DATA_SIZE`] (`1_000_000` bytes, 1 MB)
    pub max_structured_data_size: usize,
}
676
/// Partial update for `MetadataConfig`.
///
/// This struct uses `Option<T>` to represent optional fields that can be selectively updated.
/// Only specified fields (Some values) will override existing config; None values leave the
/// corresponding fields unchanged when applied via [`MetadataConfig::apply_update`].
///
/// The derived `Default` sets every field to `None`, i.e. an update that changes nothing.
///
/// # Deserialization
///
/// With the `serde` or `metadata` feature enabled this type derives `serde::Deserialize`
/// (it does not derive `Serialize`). Field names are expected in camelCase
/// (e.g. `extractDocument`); the snake_case spelling of each field is accepted as an alias.
///
/// # Fields
///
/// - `extract_document`: Optional override for document-level metadata extraction
/// - `extract_headers`: Optional override for heading element extraction
/// - `extract_links`: Optional override for link element extraction
/// - `extract_images`: Optional override for image element extraction
/// - `extract_structured_data`: Optional override for structured data extraction
/// - `max_structured_data_size`: Optional override for structured data size limit
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::{MetadataConfig, MetadataConfigUpdate};
/// let update = MetadataConfigUpdate {
///     extract_document: Some(false),
///     extract_headers: Some(true),
///     extract_links: None,            // No change
///     extract_images: None,           // No change
///     extract_structured_data: None,  // No change
///     max_structured_data_size: None, // No change
/// };
///
/// let mut config = MetadataConfig::default();
/// config.apply_update(update);
/// assert!(!config.extract_document);
/// assert!(config.extract_headers);
/// ```
#[derive(Debug, Clone, Default)]
#[cfg_attr(any(feature = "serde", feature = "metadata"), derive(serde::Deserialize))]
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
pub struct MetadataConfigUpdate {
    /// Optional override for extracting document-level metadata.
    ///
    /// When Some(true), enables document metadata extraction; Some(false) disables it.
    /// None leaves the current setting unchanged.
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_document"))]
    pub extract_document: Option<bool>,

    /// Optional override for extracting heading elements (h1-h6).
    ///
    /// When Some(true), enables header extraction; Some(false) disables it.
    /// None leaves the current setting unchanged.
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_headers"))]
    pub extract_headers: Option<bool>,

    /// Optional override for extracting anchor (link) elements.
    ///
    /// When Some(true), enables link extraction; Some(false) disables it.
    /// None leaves the current setting unchanged.
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_links"))]
    pub extract_links: Option<bool>,

    /// Optional override for extracting image elements.
    ///
    /// When Some(true), enables image extraction; Some(false) disables it.
    /// None leaves the current setting unchanged.
    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(alias = "extract_images"))]
    pub extract_images: Option<bool>,

    /// Optional override for extracting structured data (JSON-LD, Microdata, `RDFa`).
    ///
    /// When Some(true), enables structured data extraction; Some(false) disables it.
    /// None leaves the current setting unchanged.
    #[cfg_attr(
        any(feature = "serde", feature = "metadata"),
        serde(alias = "extract_structured_data")
    )]
    pub extract_structured_data: Option<bool>,

    /// Optional override for maximum structured data collection size in bytes.
    ///
    /// When Some(size), sets the new size limit. None leaves the current limit unchanged.
    /// Use this to adjust safety thresholds for different documents.
    #[cfg_attr(
        any(feature = "serde", feature = "metadata"),
        serde(alias = "max_structured_data_size")
    )]
    pub max_structured_data_size: Option<usize>,
}
762
763impl Default for MetadataConfig {
764 /// Create default metadata configuration.
765 ///
766 /// Defaults to extracting all metadata types with 1MB limit on structured data.
767 fn default() -> Self {
768 Self {
769 extract_document: true,
770 extract_headers: true,
771 extract_links: true,
772 extract_images: true,
773 extract_structured_data: true,
774 max_structured_data_size: DEFAULT_MAX_STRUCTURED_DATA_SIZE,
775 }
776 }
777}
778
779impl MetadataConfig {
780 /// Check if any metadata extraction is enabled.
781 ///
782 /// Returns `true` if at least one extraction category is enabled, `false` if all are disabled.
783 /// This is useful for early exit optimization when the application doesn't need metadata.
784 ///
785 /// # Returns
786 ///
787 /// `true` if any of the extraction flags are enabled, `false` if all are disabled.
788 ///
789 /// # Examples
790 ///
791 /// ```
792 /// # use html_to_markdown_rs::metadata::MetadataConfig;
793 /// // All enabled
794 /// let config = MetadataConfig::default();
795 /// assert!(config.any_enabled());
796 ///
797 /// // Selectively enabled
798 /// let config = MetadataConfig {
799 /// extract_headers: true,
800 /// extract_document: false,
801 /// extract_links: false,
802 /// extract_images: false,
803 /// extract_structured_data: false,
804 /// max_structured_data_size: 1_000_000,
805 /// };
806 /// assert!(config.any_enabled());
807 ///
808 /// // All disabled
809 /// let config = MetadataConfig {
810 /// extract_document: false,
811 /// extract_headers: false,
812 /// extract_links: false,
813 /// extract_images: false,
814 /// extract_structured_data: false,
815 /// max_structured_data_size: 1_000_000,
816 /// };
817 /// assert!(!config.any_enabled());
818 /// ```
819 #[must_use]
820 pub const fn any_enabled(&self) -> bool {
821 self.extract_document
822 || self.extract_headers
823 || self.extract_links
824 || self.extract_images
825 || self.extract_structured_data
826 }
827
828 /// Apply a partial update to this metadata configuration.
829 ///
830 /// Any specified fields in the update (Some values) will override the current values.
831 /// Unspecified fields (None) are left unchanged. This allows selective modification
832 /// of configuration without affecting unrelated settings.
833 ///
834 /// # Arguments
835 ///
836 /// * `update` - Partial metadata config update with fields to override
837 ///
838 /// # Examples
839 ///
840 /// ```
841 /// # use html_to_markdown_rs::metadata::{MetadataConfig, MetadataConfigUpdate};
842 /// let mut config = MetadataConfig::default();
843 /// // config starts with all extraction enabled
844 ///
845 /// let update = MetadataConfigUpdate {
846 /// extract_document: Some(false),
847 /// extract_images: Some(false),
848 /// // All other fields are None, so they won't change
849 /// ..Default::default()
850 /// };
851 ///
852 /// config.apply_update(update);
853 ///
854 /// assert!(!config.extract_document);
855 /// assert!(!config.extract_images);
856 /// assert!(config.extract_headers); // Unchanged
857 /// assert!(config.extract_links); // Unchanged
858 /// ```
859 pub const fn apply_update(&mut self, update: MetadataConfigUpdate) {
860 if let Some(extract_document) = update.extract_document {
861 self.extract_document = extract_document;
862 }
863 if let Some(extract_headers) = update.extract_headers {
864 self.extract_headers = extract_headers;
865 }
866 if let Some(extract_links) = update.extract_links {
867 self.extract_links = extract_links;
868 }
869 if let Some(extract_images) = update.extract_images {
870 self.extract_images = extract_images;
871 }
872 if let Some(extract_structured_data) = update.extract_structured_data {
873 self.extract_structured_data = extract_structured_data;
874 }
875 if let Some(max_structured_data_size) = update.max_structured_data_size {
876 self.max_structured_data_size = max_structured_data_size;
877 }
878 }
879
880 /// Create new metadata configuration from a partial update.
881 ///
882 /// Creates a new `MetadataConfig` struct with defaults, then applies the update.
883 /// Fields not specified in the update (None) keep their default values.
884 /// This is a convenience method for constructing a configuration from a partial specification
885 /// without needing to explicitly call `.default()` first.
886 ///
887 /// # Arguments
888 ///
889 /// * `update` - Partial metadata config update with fields to set
890 ///
891 /// # Returns
892 ///
893 /// New `MetadataConfig` with specified updates applied to defaults
894 ///
895 /// # Examples
896 ///
897 /// ```
898 /// # use html_to_markdown_rs::metadata::{MetadataConfig, MetadataConfigUpdate};
899 /// let update = MetadataConfigUpdate {
900 /// extract_document: Some(false),
901 /// extract_headers: Some(true),
902 /// extract_links: Some(true),
903 /// extract_images: None, // Will use default (true)
904 /// extract_structured_data: None, // Will use default (true)
905 /// max_structured_data_size: None, // Will use default (1MB)
906 /// };
907 ///
908 /// let config = MetadataConfig::from_update(update);
909 ///
910 /// assert!(!config.extract_document);
911 /// assert!(config.extract_headers);
912 /// assert!(config.extract_links);
913 /// assert!(config.extract_images); // Default
914 /// assert!(config.extract_structured_data); // Default
915 /// ```
916 #[must_use]
917 pub fn from_update(update: MetadataConfigUpdate) -> Self {
918 let mut config = Self::default();
919 config.apply_update(update);
920 config
921 }
922}
923
/// Conversion from a partial update into a full configuration.
///
/// Equivalent to calling [`MetadataConfig::from_update`].
impl From<MetadataConfigUpdate> for MetadataConfig {
    /// Applies `update` on top of the default configuration.
    fn from(update: MetadataConfigUpdate) -> Self {
        Self::from_update(update)
    }
}
929
/// Aggregate result of metadata extraction for one HTML document.
///
/// Bundles every extracted metadata category in a single structure. All fields
/// are public and the type is `Default`, so an empty value can be built directly.
/// Serde `Serialize`/`Deserialize` implementations are derived only when the
/// `metadata` feature is enabled.
///
/// # Examples
///
/// ```
/// # use html_to_markdown_rs::metadata::ExtendedMetadata;
/// let metadata = ExtendedMetadata {
///     document: Default::default(),
///     headers: Vec::new(),
///     links: Vec::new(),
///     images: Vec::new(),
///     structured_data: Vec::new(),
/// };
///
/// assert!(metadata.headers.is_empty());
/// ```
#[derive(Debug, Clone, Default)]
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
pub struct ExtendedMetadata {
    /// Document-level metadata (title, description, canonical URL, etc.)
    pub document: DocumentMetadata,

    /// Heading elements, in the order they were collected
    pub headers: Vec<HeaderMetadata>,

    /// Hyperlinks, each carrying its classified link type
    pub links: Vec<LinkMetadata>,

    /// Images with their source, optional dimensions and image type
    pub images: Vec<ImageMetadata>,

    /// Structured data blocks (e.g. JSON-LD)
    pub structured_data: Vec<StructuredData>,
}
967
/// Internal metadata collector for single-pass extraction.
///
/// Follows the [`InlineImageCollector`](crate::inline_images::InlineImageCollector) pattern:
/// callers feed it headers, links, images, JSON-LD blocks and head/`lang`/`dir`
/// values while walking the tree, then call `finish` to obtain an
/// [`ExtendedMetadata`]. Each `add_*`/`set_*` method checks the matching switch in
/// the stored [`MetadataConfig`] and ignores input for disabled categories.
///
/// # Internal State
///
/// - `head_metadata`: Raw key/value pairs gathered from the head element
/// - `headers`: Collected header elements
/// - `header_stack`: Scratch stack for header nesting
/// - `links`: Collected link elements
/// - `images`: Collected image elements
/// - `json_ld`: Raw JSON-LD script block contents
/// - `structured_data_size`: Total bytes of JSON-LD accepted so far
/// - `config`: Extraction switches and the structured-data size limit
/// - `lang`: Document language (first value set wins)
/// - `dir`: Document text direction (first value set wins)
#[derive(Debug)]
#[allow(dead_code)]
pub(crate) struct MetadataCollector {
    /// Raw head metadata, later converted into `DocumentMetadata`.
    head_metadata: BTreeMap<String, String>,
    /// Headers accepted by `add_header`.
    headers: Vec<HeaderMetadata>,
    /// NOTE(review): only allocated in `new` within this part of the file; presumably
    /// reserved for hierarchy tracking - confirm it is used elsewhere.
    header_stack: Vec<usize>,
    /// Links accepted by `add_link`.
    links: Vec<LinkMetadata>,
    /// Images accepted by `add_image`.
    images: Vec<ImageMetadata>,
    /// JSON-LD payloads accepted by `add_json_ld`.
    json_ld: Vec<String>,
    /// Running byte total of `json_ld`, checked against `config.max_structured_data_size`.
    structured_data_size: usize,
    /// Which categories to collect and the structured-data size budget.
    config: MetadataConfig,
    /// Raw `lang` attribute value, if one was recorded.
    lang: Option<String>,
    /// Raw `dir` attribute value, parsed into a `TextDirection` by `finish`.
    dir: Option<String>,
}
1010
1011#[allow(dead_code)]
1012impl MetadataCollector {
1013 /// Create a new metadata collector with configuration.
1014 ///
1015 /// Pre-allocates collections based on typical document sizes
1016 /// for efficient append operations during traversal.
1017 ///
1018 /// # Arguments
1019 ///
1020 /// * `config` - Extraction configuration specifying which types to collect
1021 ///
1022 /// # Returns
1023 ///
1024 /// A new collector ready for use during tree traversal.
1025 ///
1026 /// # Examples
1027 ///
1028 /// ```ignore
1029 /// let config = MetadataConfig::default();
1030 /// let collector = MetadataCollector::new(config);
1031 /// ```
1032 pub(crate) fn new(config: MetadataConfig) -> Self {
1033 Self {
1034 head_metadata: BTreeMap::new(),
1035 headers: Vec::with_capacity(32),
1036 header_stack: Vec::with_capacity(6),
1037 links: Vec::with_capacity(64),
1038 images: Vec::with_capacity(16),
1039 json_ld: Vec::with_capacity(4),
1040 structured_data_size: 0,
1041 config,
1042 lang: None,
1043 dir: None,
1044 }
1045 }
1046
1047 /// Add a header element to the collection.
1048 ///
1049 /// Validates that level is in range 1-6 and tracks hierarchy via depth.
1050 ///
1051 /// # Arguments
1052 ///
1053 /// * `level` - Header level (1-6)
1054 /// * `text` - Normalized header text content
1055 /// * `id` - Optional HTML id attribute
1056 /// * `depth` - Current document nesting depth
1057 /// * `html_offset` - Byte offset in original HTML
1058 pub(crate) fn add_header(&mut self, level: u8, text: String, id: Option<String>, depth: usize, html_offset: usize) {
1059 if !self.config.extract_headers {
1060 return;
1061 }
1062
1063 if !(1..=6).contains(&level) {
1064 return;
1065 }
1066
1067 let header = HeaderMetadata {
1068 level,
1069 text,
1070 id,
1071 depth,
1072 html_offset,
1073 };
1074
1075 self.headers.push(header);
1076 }
1077
1078 /// Add a link element to the collection.
1079 ///
1080 /// Classifies the link based on href value and stores with metadata.
1081 ///
1082 /// # Arguments
1083 ///
1084 /// * `href` - The href attribute value
1085 /// * `text` - Link text content
1086 /// * `title` - Optional title attribute
1087 /// * `rel` - Comma/space-separated rel attribute value
1088 /// * `attributes` - Additional attributes to capture (e.g., data-* or aria-* values)
1089 pub(crate) fn add_link(
1090 &mut self,
1091 href: String,
1092 text: String,
1093 title: Option<String>,
1094 rel: Option<String>,
1095 attributes: BTreeMap<String, String>,
1096 ) {
1097 if !self.config.extract_links {
1098 return;
1099 }
1100
1101 let link_type = LinkMetadata::classify_link(&href);
1102
1103 let rel_vec = rel
1104 .map(|r| {
1105 r.split_whitespace()
1106 .map(std::string::ToString::to_string)
1107 .collect::<Vec<_>>()
1108 })
1109 .unwrap_or_default();
1110
1111 let link = LinkMetadata {
1112 href,
1113 text,
1114 title,
1115 link_type,
1116 rel: rel_vec,
1117 attributes,
1118 };
1119
1120 self.links.push(link);
1121 }
1122
1123 /// Add an image element to the collection.
1124 ///
1125 /// # Arguments
1126 ///
1127 /// * `src` - Image source (URL or data URI)
1128 /// * `alt` - Optional alt text
1129 /// * `title` - Optional title attribute
1130 /// * `dimensions` - Optional (width, height) tuple
1131 pub(crate) fn add_image(
1132 &mut self,
1133 src: String,
1134 alt: Option<String>,
1135 title: Option<String>,
1136 dimensions: Option<(u32, u32)>,
1137 attributes: BTreeMap<String, String>,
1138 ) {
1139 if !self.config.extract_images {
1140 return;
1141 }
1142
1143 let image_type = if src.starts_with("data:") {
1144 ImageType::DataUri
1145 } else if src.starts_with("http://") || src.starts_with("https://") {
1146 ImageType::External
1147 } else if src.starts_with('<') && src.contains("svg") {
1148 ImageType::InlineSvg
1149 } else {
1150 ImageType::Relative
1151 };
1152
1153 let image = ImageMetadata {
1154 src,
1155 alt,
1156 title,
1157 dimensions,
1158 image_type,
1159 attributes,
1160 };
1161
1162 self.images.push(image);
1163 }
1164
1165 /// Add a JSON-LD structured data block.
1166 ///
1167 /// Accumulates JSON content with size validation against configured limits.
1168 ///
1169 /// # Arguments
1170 ///
1171 /// * `json_content` - Raw JSON string content
1172 pub(crate) fn add_json_ld(&mut self, json_content: String) {
1173 if !self.config.extract_structured_data {
1174 return;
1175 }
1176
1177 let content_size = json_content.len();
1178 if content_size > self.config.max_structured_data_size {
1179 return;
1180 }
1181 if self.structured_data_size + content_size > self.config.max_structured_data_size {
1182 return;
1183 }
1184
1185 self.structured_data_size += content_size;
1186 self.json_ld.push(json_content);
1187 }
1188
1189 /// Set document head metadata from extracted head section.
1190 ///
1191 /// Merges metadata pairs from head elements (meta, title, link, etc.)
1192 /// into the collector's head metadata store.
1193 ///
1194 /// # Arguments
1195 ///
1196 /// * `metadata` - `BTreeMap` of metadata key-value pairs
1197 pub(crate) fn set_head_metadata(&mut self, metadata: BTreeMap<String, String>) {
1198 if !self.config.extract_document {
1199 return;
1200 }
1201 self.head_metadata.extend(metadata);
1202 }
1203
1204 /// Set document language attribute.
1205 ///
1206 /// Usually from `lang` attribute on `<html>` or `<body>` tag.
1207 /// Only sets if not already set (first occurrence wins).
1208 ///
1209 /// # Arguments
1210 ///
1211 /// * `lang` - Language code (e.g., "en", "es", "fr")
1212 pub(crate) fn set_language(&mut self, lang: String) {
1213 if !self.config.extract_document {
1214 return;
1215 }
1216 if self.lang.is_none() {
1217 self.lang = Some(lang);
1218 }
1219 }
1220
1221 /// Set document text direction attribute.
1222 ///
1223 /// Usually from `dir` attribute on `<html>` or `<body>` tag.
1224 /// Only sets if not already set (first occurrence wins).
1225 ///
1226 /// # Arguments
1227 ///
1228 /// * `dir` - Direction string ("ltr", "rtl", or "auto")
1229 pub(crate) fn set_text_direction(&mut self, dir: String) {
1230 if !self.config.extract_document {
1231 return;
1232 }
1233 if self.dir.is_none() {
1234 self.dir = Some(dir);
1235 }
1236 }
1237
    /// Returns `true` when the configuration enables document-level metadata extraction.
    pub(crate) const fn wants_document(&self) -> bool {
        self.config.extract_document
    }
1241
    /// Returns `true` when the configuration enables header extraction.
    pub(crate) const fn wants_headers(&self) -> bool {
        self.config.extract_headers
    }
1245
    /// Returns `true` when the configuration enables link extraction.
    pub(crate) const fn wants_links(&self) -> bool {
        self.config.extract_links
    }
1249
    /// Returns `true` when the configuration enables image extraction.
    pub(crate) const fn wants_images(&self) -> bool {
        self.config.extract_images
    }
1253
    /// Returns `true` when the configuration enables structured data extraction.
    pub(crate) const fn wants_structured_data(&self) -> bool {
        self.config.extract_structured_data
    }
1257
1258 /// Extract document metadata from collected head metadata.
1259 ///
1260 /// Parses head metadata into structured document metadata,
1261 /// handling special cases like Open Graph, Twitter Card, keywords, etc.
1262 #[allow(dead_code)]
1263 fn extract_document_metadata(
1264 head_metadata: BTreeMap<String, String>,
1265 lang: Option<String>,
1266 dir: Option<String>,
1267 ) -> DocumentMetadata {
1268 let mut doc = DocumentMetadata::default();
1269
1270 for (raw_key, value) in head_metadata {
1271 let mut key = raw_key.as_str();
1272 let mut replaced_key: Option<String> = None;
1273
1274 if let Some(stripped) = key.strip_prefix("meta-") {
1275 key = stripped;
1276 }
1277
1278 if key.as_bytes().contains(&b':') {
1279 replaced_key = Some(key.replace(':', "-"));
1280 key = replaced_key.as_deref().unwrap_or(key);
1281 }
1282
1283 match key {
1284 "title" => doc.title = Some(value),
1285 "description" => doc.description = Some(value),
1286 "author" => doc.author = Some(value),
1287 "canonical" => doc.canonical_url = Some(value),
1288 "base" | "base-href" => doc.base_href = Some(value),
1289 key if key.starts_with("og-") => {
1290 let og_key = if key.as_bytes().contains(&b'-') {
1291 key.trim_start_matches("og-").replace('-', "_")
1292 } else {
1293 key.trim_start_matches("og-").to_string()
1294 };
1295 doc.open_graph.insert(og_key, value);
1296 }
1297 key if key.starts_with("twitter-") => {
1298 let tw_key = if key.as_bytes().contains(&b'-') {
1299 key.trim_start_matches("twitter-").replace('-', "_")
1300 } else {
1301 key.trim_start_matches("twitter-").to_string()
1302 };
1303 doc.twitter_card.insert(tw_key, value);
1304 }
1305 "keywords" => {
1306 doc.keywords = value
1307 .split(',')
1308 .map(|s| s.trim().to_string())
1309 .filter(|s| !s.is_empty())
1310 .collect();
1311 }
1312 _ => {
1313 let meta_key = if key.as_ptr() == raw_key.as_ptr() && key.len() == raw_key.len() {
1314 raw_key
1315 } else if let Some(replaced) = replaced_key {
1316 replaced
1317 } else {
1318 key.to_string()
1319 };
1320 doc.meta_tags.insert(meta_key, value);
1321 }
1322 }
1323 }
1324
1325 if let Some(lang) = lang {
1326 doc.language = Some(lang);
1327 }
1328
1329 if let Some(dir) = dir {
1330 if let Some(parsed_dir) = TextDirection::parse(&dir) {
1331 doc.text_direction = Some(parsed_dir);
1332 }
1333 }
1334
1335 doc
1336 }
1337
1338 /// Extract structured data blocks into `StructuredData` items.
1339 #[allow(dead_code)]
1340 fn extract_structured_data(json_ld: Vec<String>) -> Vec<StructuredData> {
1341 let mut result = Vec::with_capacity(json_ld.len());
1342
1343 for json_str in json_ld {
1344 let schema_type = Self::scan_schema_type(&json_str)
1345 .or_else(|| {
1346 if json_str.contains("\"@type\"") {
1347 serde_json::from_str::<serde_json::Value>(&json_str).ok().and_then(|v| {
1348 v.get("@type")
1349 .and_then(|t| t.as_str().map(std::string::ToString::to_string))
1350 })
1351 } else {
1352 None
1353 }
1354 })
1355 .or_else(|| {
1356 if !json_str.contains("\"@graph\"") {
1357 return None;
1358 }
1359
1360 let value = serde_json::from_str::<serde_json::Value>(&json_str).ok()?;
1361 let graph = value.get("@graph")?;
1362 let items = graph.as_array()?;
1363 items.iter().find_map(|item| {
1364 item.get("@type")
1365 .and_then(|t| t.as_str().map(std::string::ToString::to_string))
1366 })
1367 });
1368
1369 result.push(StructuredData {
1370 data_type: StructuredDataType::JsonLd,
1371 raw_json: json_str,
1372 schema_type,
1373 });
1374 }
1375
1376 result
1377 }
1378
1379 fn scan_schema_type(json_str: &str) -> Option<String> {
1380 let needle = "\"@type\"";
1381 let start = json_str.find(needle)? + needle.len();
1382 let bytes = json_str.as_bytes();
1383 let mut i = start;
1384
1385 while i < bytes.len() && bytes[i].is_ascii_whitespace() {
1386 i += 1;
1387 }
1388 if i >= bytes.len() || bytes[i] != b':' {
1389 return None;
1390 }
1391 i += 1;
1392 while i < bytes.len() && bytes[i].is_ascii_whitespace() {
1393 i += 1;
1394 }
1395 if i >= bytes.len() {
1396 return None;
1397 }
1398
1399 if bytes[i] == b'[' {
1400 i += 1;
1401 while i < bytes.len() && bytes[i].is_ascii_whitespace() {
1402 i += 1;
1403 }
1404 if i >= bytes.len() || bytes[i] != b'"' {
1405 return None;
1406 }
1407 } else if bytes[i] != b'"' {
1408 return None;
1409 }
1410
1411 let start_quote = i;
1412 i += 1;
1413 let mut escaped = false;
1414 while i < bytes.len() {
1415 let byte = bytes[i];
1416 if escaped {
1417 escaped = false;
1418 i += 1;
1419 continue;
1420 }
1421 if byte == b'\\' {
1422 escaped = true;
1423 i += 1;
1424 continue;
1425 }
1426 if byte == b'"' {
1427 let end_quote = i;
1428 let slice = &json_str[start_quote..=end_quote];
1429 return serde_json::from_str::<String>(slice).ok();
1430 }
1431 i += 1;
1432 }
1433
1434 None
1435 }
1436
1437 /// Finish collection and return all extracted metadata.
1438 ///
1439 /// Performs final processing, validation, and consolidation of all
1440 /// collected data into the [`ExtendedMetadata`] output structure.
1441 ///
1442 /// # Returns
1443 ///
1444 /// Complete [`ExtendedMetadata`] with all extracted information.
1445 #[allow(dead_code)]
1446 pub(crate) fn finish(self) -> ExtendedMetadata {
1447 let structured_data = Self::extract_structured_data(self.json_ld);
1448 let document = Self::extract_document_metadata(self.head_metadata, self.lang, self.dir);
1449
1450 ExtendedMetadata {
1451 document,
1452 headers: self.headers,
1453 links: self.links,
1454 images: self.images,
1455 structured_data,
1456 }
1457 }
1458
1459 /// Categorize links by type for analysis and filtering.
1460 ///
1461 /// Separates collected links into groups by [`LinkType`].
1462 /// This is an analysis helper method; actual categorization happens during `add_link`.
1463 ///
1464 /// # Returns
1465 ///
1466 /// `BTreeMap` with `LinkType` as key and Vec of matching `LinkMetadata` as value.
1467 #[allow(dead_code)]
1468 pub(crate) fn categorize_links(&self) -> BTreeMap<String, Vec<&LinkMetadata>> {
1469 let mut categorized: BTreeMap<String, Vec<&LinkMetadata>> = BTreeMap::new();
1470
1471 for link in &self.links {
1472 let category = link.link_type.to_string();
1473 categorized.entry(category).or_default().push(link);
1474 }
1475
1476 categorized
1477 }
1478
1479 /// Count headers by level for structural analysis.
1480 ///
1481 /// Returns count of headers at each level (1-6).
1482 ///
1483 /// # Returns
1484 ///
1485 /// `BTreeMap` with level as string key and count as value.
1486 #[allow(dead_code)]
1487 pub(crate) fn header_counts(&self) -> BTreeMap<String, usize> {
1488 let mut counts: BTreeMap<String, usize> = BTreeMap::new();
1489
1490 for header in &self.headers {
1491 *counts.entry(header.level.to_string()).or_insert(0) += 1;
1492 }
1493
1494 counts
1495 }
1496}
1497
/// Shared handle to a metadata collector: `Rc<RefCell<MetadataCollector>>`.
///
/// Used internally for sharing collector state across the tree traversal.
/// Matches the pattern used for [`InlineImageCollector`](crate::inline_images::InlineImageCollector).
/// `finish` consumes the collector, so the handle has to be unwrapped back into an
/// owned value once traversal is done.
///
/// # Examples
///
/// ```ignore
/// let collector = MetadataCollector::new(MetadataConfig::default());
/// let handle = Rc::new(RefCell::new(collector));
///
/// // During the tree walk the handle can be cloned and borrowed mutably.
/// handle.borrow_mut().add_header(1, "Title".to_string(), None, 0, 100);
///
/// // Succeeds only when this is the last remaining handle.
/// let metadata = Rc::try_unwrap(handle)
///     .ok()
///     .map(|cell| cell.into_inner().finish());
/// ```
#[allow(dead_code)]
pub(crate) type MetadataCollectorHandle = Rc<RefCell<MetadataCollector>>;
1516
1517#[cfg(test)]
1518mod tests {
1519 use super::*;
1520
1521 #[test]
1522 fn test_text_direction_parse() {
1523 assert_eq!(TextDirection::parse("ltr"), Some(TextDirection::LeftToRight));
1524 assert_eq!(TextDirection::parse("rtl"), Some(TextDirection::RightToLeft));
1525 assert_eq!(TextDirection::parse("auto"), Some(TextDirection::Auto));
1526 assert_eq!(TextDirection::parse("invalid"), None);
1527 assert_eq!(TextDirection::parse("LTR"), Some(TextDirection::LeftToRight));
1528 }
1529
1530 #[test]
1531 fn test_text_direction_display() {
1532 assert_eq!(TextDirection::LeftToRight.to_string(), "ltr");
1533 assert_eq!(TextDirection::RightToLeft.to_string(), "rtl");
1534 assert_eq!(TextDirection::Auto.to_string(), "auto");
1535 }
1536
1537 #[test]
1538 fn test_link_classification() {
1539 assert_eq!(LinkMetadata::classify_link("#section"), LinkType::Anchor);
1540 assert_eq!(LinkMetadata::classify_link("mailto:test@example.com"), LinkType::Email);
1541 assert_eq!(LinkMetadata::classify_link("tel:+1234567890"), LinkType::Phone);
1542 assert_eq!(LinkMetadata::classify_link("https://example.com"), LinkType::External);
1543 assert_eq!(LinkMetadata::classify_link("http://example.com"), LinkType::External);
1544 assert_eq!(LinkMetadata::classify_link("/path/to/page"), LinkType::Internal);
1545 assert_eq!(LinkMetadata::classify_link("../relative"), LinkType::Internal);
1546 assert_eq!(LinkMetadata::classify_link("./same"), LinkType::Internal);
1547 }
1548
1549 #[test]
1550 fn test_header_validation() {
1551 let valid = HeaderMetadata {
1552 level: 3,
1553 text: "Title".to_string(),
1554 id: None,
1555 depth: 2,
1556 html_offset: 100,
1557 };
1558 assert!(valid.is_valid());
1559
1560 let invalid_high = HeaderMetadata {
1561 level: 7,
1562 text: "Title".to_string(),
1563 id: None,
1564 depth: 2,
1565 html_offset: 100,
1566 };
1567 assert!(!invalid_high.is_valid());
1568
1569 let invalid_low = HeaderMetadata {
1570 level: 0,
1571 text: "Title".to_string(),
1572 id: None,
1573 depth: 2,
1574 html_offset: 100,
1575 };
1576 assert!(!invalid_low.is_valid());
1577 }
1578
1579 #[test]
1580 fn test_metadata_collector_new() {
1581 let config = MetadataConfig::default();
1582 let collector = MetadataCollector::new(config);
1583
1584 assert_eq!(collector.headers.capacity(), 32);
1585 assert_eq!(collector.links.capacity(), 64);
1586 assert_eq!(collector.images.capacity(), 16);
1587 assert_eq!(collector.json_ld.capacity(), 4);
1588 }
1589
1590 #[test]
1591 fn test_metadata_collector_add_header() {
1592 let config = MetadataConfig::default();
1593 let mut collector = MetadataCollector::new(config);
1594
1595 collector.add_header(1, "Title".to_string(), Some("title".to_string()), 0, 100);
1596 assert_eq!(collector.headers.len(), 1);
1597
1598 let header = &collector.headers[0];
1599 assert_eq!(header.level, 1);
1600 assert_eq!(header.text, "Title");
1601 assert_eq!(header.id, Some("title".to_string()));
1602
1603 collector.add_header(7, "Invalid".to_string(), None, 0, 200);
1604 assert_eq!(collector.headers.len(), 1);
1605 }
1606
1607 #[test]
1608 fn test_metadata_collector_add_link() {
1609 let config = MetadataConfig::default();
1610 let mut collector = MetadataCollector::new(config);
1611
1612 collector.add_link(
1613 "https://example.com".to_string(),
1614 "Example".to_string(),
1615 Some("Visit".to_string()),
1616 Some("nofollow external".to_string()),
1617 BTreeMap::from([("data-id".to_string(), "example".to_string())]),
1618 );
1619
1620 assert_eq!(collector.links.len(), 1);
1621
1622 let link = &collector.links[0];
1623 assert_eq!(link.href, "https://example.com");
1624 assert_eq!(link.text, "Example");
1625 assert_eq!(link.link_type, LinkType::External);
1626 assert_eq!(link.rel, vec!["nofollow", "external"]);
1627 assert_eq!(link.attributes.get("data-id"), Some(&"example".to_string()));
1628 }
1629
1630 #[test]
1631 fn test_metadata_collector_respects_config() {
1632 let config = MetadataConfig {
1633 extract_document: false,
1634 extract_headers: false,
1635 extract_links: false,
1636 extract_images: false,
1637 extract_structured_data: false,
1638 max_structured_data_size: DEFAULT_MAX_STRUCTURED_DATA_SIZE,
1639 };
1640 let mut collector = MetadataCollector::new(config);
1641
1642 collector.add_header(1, "Title".to_string(), None, 0, 100);
1643 collector.add_link(
1644 "https://example.com".to_string(),
1645 "Link".to_string(),
1646 None,
1647 None,
1648 BTreeMap::new(),
1649 );
1650 collector.add_image(
1651 "https://example.com/img.jpg".to_string(),
1652 None,
1653 None,
1654 None,
1655 BTreeMap::new(),
1656 );
1657 collector.add_json_ld("{}".to_string());
1658
1659 assert!(collector.headers.is_empty());
1660 assert!(collector.links.is_empty());
1661 assert!(collector.images.is_empty());
1662 assert!(collector.json_ld.is_empty());
1663 }
1664
1665 #[test]
1666 fn test_metadata_collector_finish() {
1667 let config = MetadataConfig::default();
1668 let mut collector = MetadataCollector::new(config);
1669
1670 collector.set_language("en".to_string());
1671 collector.add_header(1, "Main Title".to_string(), None, 0, 100);
1672 collector.add_link(
1673 "https://example.com".to_string(),
1674 "Example".to_string(),
1675 None,
1676 None,
1677 BTreeMap::new(),
1678 );
1679
1680 let metadata = collector.finish();
1681
1682 assert_eq!(metadata.document.language, Some("en".to_string()));
1683 assert_eq!(metadata.headers.len(), 1);
1684 assert_eq!(metadata.links.len(), 1);
1685 }
1686
1687 #[test]
1688 fn test_document_metadata_default() {
1689 let doc = DocumentMetadata::default();
1690
1691 assert!(doc.title.is_none());
1692 assert!(doc.description.is_none());
1693 assert!(doc.keywords.is_empty());
1694 assert!(doc.open_graph.is_empty());
1695 assert!(doc.twitter_card.is_empty());
1696 assert!(doc.meta_tags.is_empty());
1697 }
1698
1699 #[test]
1700 fn test_metadata_config_default() {
1701 let config = MetadataConfig::default();
1702
1703 assert!(config.extract_headers);
1704 assert!(config.extract_links);
1705 assert!(config.extract_images);
1706 assert!(config.extract_structured_data);
1707 assert_eq!(config.max_structured_data_size, DEFAULT_MAX_STRUCTURED_DATA_SIZE);
1708 }
1709
1710 #[test]
1711 fn test_image_type_classification() {
1712 let data_uri = ImageMetadata {
1713 src: "...".to_string(),
1714 alt: None,
1715 title: None,
1716 dimensions: None,
1717 image_type: ImageType::DataUri,
1718 attributes: BTreeMap::new(),
1719 };
1720 assert_eq!(data_uri.image_type, ImageType::DataUri);
1721
1722 let external = ImageMetadata {
1723 src: "https://example.com/image.jpg".to_string(),
1724 alt: None,
1725 title: None,
1726 dimensions: None,
1727 image_type: ImageType::External,
1728 attributes: BTreeMap::new(),
1729 };
1730 assert_eq!(external.image_type, ImageType::External);
1731 }
1732
1733 #[test]
1734 fn test_link_type_display() {
1735 assert_eq!(LinkType::Anchor.to_string(), "anchor");
1736 assert_eq!(LinkType::Internal.to_string(), "internal");
1737 assert_eq!(LinkType::External.to_string(), "external");
1738 assert_eq!(LinkType::Email.to_string(), "email");
1739 assert_eq!(LinkType::Phone.to_string(), "phone");
1740 assert_eq!(LinkType::Other.to_string(), "other");
1741 }
1742
1743 #[test]
1744 fn test_structured_data_type_display() {
1745 assert_eq!(StructuredDataType::JsonLd.to_string(), "json_ld");
1746 assert_eq!(StructuredDataType::Microdata.to_string(), "microdata");
1747 assert_eq!(StructuredDataType::RDFa.to_string(), "rdfa");
1748 }
1749
1750 #[test]
1751 fn test_categorize_links() {
1752 let config = MetadataConfig::default();
1753 let mut collector = MetadataCollector::new(config);
1754
1755 collector.add_link("#anchor".to_string(), "Anchor".to_string(), None, None, BTreeMap::new());
1756 collector.add_link(
1757 "https://example.com".to_string(),
1758 "External".to_string(),
1759 None,
1760 None,
1761 BTreeMap::new(),
1762 );
1763 collector.add_link(
1764 "mailto:test@example.com".to_string(),
1765 "Email".to_string(),
1766 None,
1767 None,
1768 BTreeMap::new(),
1769 );
1770
1771 let categorized = collector.categorize_links();
1772
1773 assert_eq!(categorized.get("anchor").map(|v| v.len()), Some(1));
1774 assert_eq!(categorized.get("external").map(|v| v.len()), Some(1));
1775 assert_eq!(categorized.get("email").map(|v| v.len()), Some(1));
1776 }
1777
1778 #[test]
1779 fn test_header_counts() {
1780 let config = MetadataConfig::default();
1781 let mut collector = MetadataCollector::new(config);
1782
1783 collector.add_header(1, "H1".to_string(), None, 0, 100);
1784 collector.add_header(2, "H2".to_string(), None, 1, 200);
1785 collector.add_header(2, "H2b".to_string(), None, 1, 300);
1786 collector.add_header(3, "H3".to_string(), None, 2, 400);
1787
1788 let counts = collector.header_counts();
1789
1790 assert_eq!(counts.get("1").copied(), Some(1));
1791 assert_eq!(counts.get("2").copied(), Some(2));
1792 assert_eq!(counts.get("3").copied(), Some(1));
1793 }
1794}