microformats-types 0.15.0

A representation of the known objects of Microformats
Documentation
use super::*;
use serde::{Deserialize, Serialize};
use std::collections::{BTreeMap, HashMap};

/// Debug information for a parsed document.
///
/// This struct contains both the parsed microformat data and detailed information
/// about which HTML elements contributed to each parsed value. This is useful for:
/// - **Debugging**: Understanding why certain values were or weren't parsed
/// - **Validation**: Verifying your markup produces the expected structure
/// - **Learning**: Understanding how microformats parsing works
/// - **Development**: Writing tests and debugging custom implementations
///
/// Values of this type are produced by `Document::into_debug_document`.
///
/// # Example
///
/// ```ignore
/// use microformats::from_html;
/// let base_url: url::Url = "https://example.com".parse().unwrap();
/// let mut parser = microformats::parse::Parser::from_html(html)?;
/// parser = parser.with_id_generation(true);
/// let doc = parser.into_document(Some(base_url))?;
///
/// if let Some(debug_doc) = doc.into_debug_document() {
///     // Inspect which elements contributed to parsing
///     for item in &debug_doc.value_sources.items {
///         println!("Item type: {:?}", item.r#type);
///         for (prop, sources) in &item.properties {
///             for source in sources {
///                 // `mf2_id` is an `Option<String>` behind a borrow, so view it
///                 // as `Option<&str>` instead of moving out of it.
///                 let id = source.element.mf2_id.as_deref().unwrap_or("unknown");
///                 println!("  Property '{}' from element {}", prop, id);
///             }
///         }
///     }
/// }
/// ```
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct DebugDocument {
    /// The parsed document containing all microformat items and relations
    pub document: Document,
    /// Mapping of values to their source elements with detailed debug info
    pub value_sources: ValueSourceMap,
    /// HTML with debug markers (data-mf2-id attributes) for visual inspection.
    ///
    /// When built by `Document::into_debug_document` this is a copy of the
    /// debug context's `original_html`; no further annotation happens there.
    pub annotated_html: String,
}

/// Mapping structure for all value sources.
///
/// This struct organizes debug information into categories:
/// - Top-level microformat items
/// - Relation URLs and their sources
///
/// This structure makes it easy to find debug info for specific items or properties.
///
/// The `Default` value has no items and no relations.
#[derive(Clone, Debug, PartialEq, Default, Serialize, Deserialize)]
pub struct ValueSourceMap {
    /// Debug information for each top-level microformat item.
    ///
    /// When built from a `Document`, items whose source element cannot be
    /// resolved are omitted, so this list may be shorter than the document's
    /// item list and positions need not line up one-to-one.
    pub items: Vec<ItemDebugInfo>,
    /// Debug information for relation URLs, keyed by the relation URL.
    pub relations: BTreeMap<url::Url, RelationDebugInfo>,
}

/// Debug information for a microformat item.
///
/// Contains detailed information about which HTML element created this microformat
/// item and all of its properties. This helps trace the entire parsing process
/// for a single microformat item.
///
/// The structure is recursive: nested items appear in `children` with the same shape.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ItemDebugInfo {
    /// The microformat type(s) this item represents (h-entry, h-card, etc.)
    pub r#type: Vec<Class>,
    /// The HTML element that triggered creation of this item
    pub element: ElementSource,
    /// Debug information for each property of this item, keyed by property name.
    ///
    /// When built from a `Document`, a property only appears here if at least
    /// one of its values could be traced to a source element.
    pub properties: BTreeMap<String, Vec<PropertyDebugInfo>>,
    /// Debug information for any child microformat items.
    ///
    /// Children whose source element cannot be resolved are left out.
    pub children: Vec<ItemDebugInfo>,
}

/// Debug information for a property value.
///
/// Contains both the extracted property value and detailed information
/// about which HTML element provided this value. This is the atomic unit
/// of debug information - each individual property value gets its own record.
///
/// Records for one property are grouped under that property's name in
/// `ItemDebugInfo::properties`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct PropertyDebugInfo {
    /// The actual property value that was parsed and extracted (a copy of the
    /// value stored on the item).
    pub value: PropertyValue,
    /// Detailed information about the source element that provided this value
    pub element: ElementSource,
}

/// Debug information for relation URLs.
///
/// Relations are URL-based connections (like rel="me" links) that are parsed
/// separately from microformat items. This struct tracks which HTML elements
/// contributed to specific relation URLs.
///
/// Note: as currently built by `Document::into_debug_document`, `urls` holds
/// exactly one entry (the relation URL rendered as a string) and `elements`
/// is always empty, because element tracking for relations is not recorded
/// during parsing yet.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct RelationDebugInfo {
    /// The actual URL values that were parsed for this relation
    pub urls: Vec<String>,
    /// HTML elements that contributed to these relation URLs
    pub elements: Vec<ElementSource>,
}

/// Detailed information about a source HTML element.
///
/// This struct contains comprehensive information about an HTML element that
/// contributed to microformat parsing, including its position, attributes,
/// and context. This is the foundational data structure for understanding
/// which elements were used during parsing.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct ElementSource {
    /// Unique identifier assigned by the parser (data-mf2-id attribute).
    /// This allows linking debug information to specific HTML elements
    /// in the annotated HTML output.
    ///
    /// Property lookups in this module skip elements whose `mf2_id` is `None`
    /// and use the id as a key into `DebugContext::elements`.
    pub mf2_id: Option<String>,
    /// The HTML tag name that contributed to parsing (e.g., "a", "div", "span").
    /// Useful for understanding which element types are being used.
    pub tag: String,
    /// All CSS class names present on the element, not just microformat classes.
    /// This helps identify semantic markup and styling information.
    pub classes: Vec<String>,
    /// Complete map of all HTML attributes and their values.
    /// This includes href, src, datetime, title, and other relevant attributes.
    pub attributes: BTreeMap<String, String>,
    /// Source code position information for this element.
    /// Includes line number, column, and byte offset for precise debugging.
    pub position: SourcePosition,
    /// Chain of parent element IDs leading to this element.
    /// This provides context about the element's place in the DOM hierarchy.
    ///
    /// NOTE(review): the ordering (nearest parent first vs. root first) is not
    /// established in this file — confirm against the parser before relying on it.
    pub parent_ids: Vec<String>,
}

/// Position information in source HTML.
///
/// Tracks the precise location of an HTML element in the source document,
/// enabling accurate debugging and error reporting. This is particularly
/// useful when debugging parsing issues or generating annotated HTML output.
///
/// The derived ordering compares `line`, then `column`, then `offset`
/// (field declaration order), so reordering the fields would change how
/// positions sort.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct SourcePosition {
    /// Line number in the source HTML document (1-based indexing).
    /// Most text editors display line numbers starting from 1.
    pub line: usize,
    /// Column number within the line (1-based indexing).
    /// Helps pinpoint the exact position of an element within its line.
    pub column: usize,
    /// Byte offset from the start of the document.
    /// Useful for binary-level accuracy and fast seeking in large documents.
    pub offset: usize,
}

/// Raw debug data gathered while parsing a document.
///
/// `Document::into_debug_document` reads this context to build a
/// [`DebugDocument`]: `property_sources` maps value paths to element IDs,
/// `elements` resolves those IDs to [`ElementSource`] records, and
/// `original_html` becomes the annotated HTML.
///
/// Unlike the other types in this module, this struct derives only
/// `Serialize` (no `Deserialize`).
#[derive(Clone, Debug, Default, PartialEq, Eq, serde::Serialize)]
pub struct DebugContext {
    /// Collection of all element information collected during parsing.
    /// Maps element IDs to their detailed debug information.
    pub elements: HashMap<String, ElementSource>,
    /// Records of which elements contributed to which property values.
    /// Each record contains the element ID, property name, and value path.
    pub property_sources: Vec<PropertySourceRecord>,
    /// Original HTML string before parsing.
    /// Used to generate annotated HTML output with data-mf2-id attributes.
    pub original_html: String,
}

/// A single link between a parsed value and the HTML element that produced it.
///
/// Records are stored in `DebugContext::property_sources` and are matched by
/// `path` when debug information is assembled.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PropertySourceRecord {
    /// JSONPath-like path to value.
    ///
    /// The lookups in this module use paths such as `items[0]` for a top-level
    /// item and `items[0].name[0]` for the first value of its `name` property.
    pub path: String,
    /// Element ID that produced this value; used as a key into
    /// `DebugContext::elements`. Item-level records are only matched directly
    /// when this ID starts with `mf2-`.
    pub element_id: String,
    /// Property name
    pub property_name: String,
}

impl Document {
    /// Convert this document into a DebugDocument with source tracking information.
    ///
    /// This method enables detailed debugging of the microformats parsing process
    /// by providing access to debug information that was collected during parsing.
    ///
    /// **Prerequisites**: The document must have been parsed with debug tracking
    /// enabled, which requires calling `Parser::with_id_generation(true)` before
    /// parsing. If debug tracking was not enabled, this method returns `None`.
    ///
    /// # Returns
    ///
    /// - `Some(DebugDocument)` if debug info was collected during parsing
    /// - `None` if the document was parsed without debug tracking
    ///
    /// # Usage Example
    ///
    /// ```ignore
    /// use microformats::from_html;
    /// let base_url: url::Url = "https://example.com".parse().unwrap();
    /// let mut parser = microformats::parse::Parser::from_html(html)?;
    /// parser = parser.with_id_generation(true);  // Enable debug tracking
    /// let doc = parser.into_document(Some(base_url))?;
    ///
    /// if let Some(debug_doc) = doc.into_debug_document() {
    ///     // Inspect which elements contributed to parsing
    ///     for item in &debug_doc.value_sources.items {
    ///         println!("Found {} item", item.r#type[0]);
    ///         for (prop, sources) in &item.properties {
    ///             for source in sources {
    ///                 println!("  Property '{}' from element {}:{}",
    ///                     prop,
    ///                     source.element.position.line,
    ///                     source.element.position.column
    ///                 );
    ///             }
    ///         }
    ///     }
    /// }
    /// ```
    ///
    /// # Use Cases
    ///
    /// - **Debugging parsing issues**: See exactly which elements were used
    /// - **Validating markup**: Verify your microformat markup is correct
    /// - **Learning**: Understand how the parser interprets different HTML structures
    /// - **Testing**: Write comprehensive tests that verify element usage
    pub fn into_debug_document(self) -> Option<DebugDocument> {
        let debug_ctx = self._debug_context.clone()?;

        // Build value sources from debug context
        let value_sources = self.build_value_sources(&debug_ctx)?;

        // Generate annotated HTML (data-mf2-id already added during parsing)
        let annotated_html = debug_ctx.original_html.clone();

        Some(DebugDocument {
            document: self,
            value_sources,
            annotated_html,
        })
    }

    fn build_value_sources(&self, ctx: &DebugContext) -> Option<ValueSourceMap> {
        let items = self
            .items
            .iter()
            .enumerate()
            .filter_map(|(idx, item)| {
                self.build_item_debug_info(item, &format!("items[{}]", idx), ctx)
            })
            .collect();

        let relations = self.build_relation_debug_info(ctx)?;

        Some(ValueSourceMap { items, relations })
    }

    fn build_item_debug_info(
        &self,
        item: &Item,
        path: &str,
        ctx: &DebugContext,
    ) -> Option<ItemDebugInfo> {
        // Find element for this item
        let element_id = ctx
            .property_sources
            .iter()
            .find(|r| r.path == path && r.element_id.starts_with("mf2-"))
            .map(|r| r.element_id.clone())
            .or_else(|| {
                // Try to find anonymous element
                ctx.property_sources
                    .iter()
                    .find(|r| r.path.starts_with(&format!("{}.children", path)))
                    .map(|r| r.element_id.clone())
            })?;

        let element = ctx.elements.get(&element_id)?.clone();

        // Build property debug info
        let mut properties = BTreeMap::new();
        for (prop_name, values) in &item.properties {
            let prop_path = format!("{}.{}", path, prop_name);
            let prop_debug_info: Vec<_> = values
                .iter()
                .enumerate()
                .filter_map(|(val_idx, value)| {
                    let val_path = format!("{}[{}]", prop_path, val_idx);

                    // Find element that produced this value
                    let val_element_id = ctx
                        .property_sources
                        .iter()
                        .find(|r| r.path == val_path || val_path.starts_with(&r.path))
                        .and_then(|r| ctx.elements.get(&r.element_id))
                        .and_then(|e| e.mf2_id.clone())?;

                    Some(PropertyDebugInfo {
                        value: value.clone(),
                        element: ctx.elements.get(&val_element_id)?.clone(),
                    })
                })
                .collect();

            if !prop_debug_info.is_empty() {
                properties.insert(prop_name.clone(), prop_debug_info);
            }
        }

        // Build children debug info recursively
        let children = item
            .children
            .iter()
            .filter_map(|child| {
                let child_path = format!("{}.children", path);
                self.build_item_debug_info(child, &child_path, ctx)
            })
            .collect();

        Some(ItemDebugInfo {
            r#type: item.r#type.clone(),
            element,
            properties,
            children,
        })
    }

    fn build_relation_debug_info(
        &self,
        _ctx: &DebugContext,
    ) -> Option<BTreeMap<url::Url, RelationDebugInfo>> {
        // Build relation debug info
        let mut relations = BTreeMap::new();
        for url in self.rels.items.keys() {
            relations.insert(
                url.clone(),
                RelationDebugInfo {
                    urls: vec![url.to_string()],
                    elements: vec![], // Would need additional tracking during parsing
                },
            );
        }
        Some(relations)
    }
}