microformats 0.18.2

A union library of the Microformats types and associated parser.
Documentation
//! Example: Finding the Representative h-card of a Page
//!
//! This example demonstrates how to find the representative h-card of a page
//! using the algorithm defined at <https://microformats.org/wiki/representative-h-card-parsing>.
//!
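//! The algorithm:
//! 1. If the page contains an h-card whose `uid` and `url` properties both match the page URL,
//!    the first such h-card is the representative h-card.
//! 2. Otherwise, if the page contains an h-card whose `url` property matches a `rel=me` URL of
//!    the page, the first such h-card is the representative h-card.
//! 3. Otherwise, if the page contains exactly one h-card and its `url` property matches the
//!    page URL, that h-card is the representative h-card.
//! 4. Otherwise, the page has no representative h-card.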
//!
//! # Usage
//! ```bash
//! cargo run --example representative_hcard
//! ```

use microformats::types::{Class, KnownClass, PropertyValue};
use std::collections::BTreeSet;
use url::Url;

/// Find the representative h-card from a parsed document.
///
/// Implements the algorithm from https://microformats.org/wiki/representative-h-card-parsing
fn find_representative_hcard(
    document: &microformats::types::Document,
    page_url: &Url,
) -> Option<microformats::types::Item> {
    // Get all h-cards on the page
    let hcards: Vec<_> = document
        .items
        .iter()
        .filter(|item| {
            item.r#type
                .iter()
                .any(|t| matches!(t, Class::Known(KnownClass::Card)))
        })
        .collect();

    if hcards.is_empty() {
        return None;
    }

    // Step 1: h-card with uid AND url both matching page URL
    for hcard in &hcards {
        let uid_matches = hcard
            .properties
            .get("uid")
            .and_then(|values| values.first())
            .map(|v| property_value_matches_url(v, page_url))
            .unwrap_or(false);

        let url_matches = hcard
            .properties
            .get("url")
            .and_then(|values| values.first())
            .map(|v| property_value_matches_url(v, page_url))
            .unwrap_or(false);

        if uid_matches && url_matches {
            return Some((*hcard).clone());
        }
    }

    // Step 2: h-card with url that also has rel=me
    let me_urls: BTreeSet<Url> = document
        .rels
        .by_rels()
        .get("me")
        .map(|urls| urls.iter().cloned().collect())
        .unwrap_or_default();

    for hcard in &hcards {
        if let Some(url_values) = hcard.properties.get("url") {
            for value in url_values {
                if let Some(hcard_url) = property_value_as_url(value) {
                    if me_urls.contains(&hcard_url) {
                        return Some((*hcard).clone());
                    }
                }
            }
        }
    }

    // Step 3: If exactly one h-card with url matching page URL
    if hcards.len() == 1 {
        let hcard = hcards[0];
        let url_matches = hcard
            .properties
            .get("url")
            .and_then(|values| values.first())
            .map(|v| property_value_matches_url(v, page_url))
            .unwrap_or(false);

        if url_matches {
            return Some(hcard.clone());
        }
    }

    // No representative h-card found
    None
}

/// Report whether a property value refers to the same URL as `url`.
///
/// URL-typed values are compared directly; plain-text values are parsed as a
/// URL first and count as a mismatch when parsing fails. Every other kind of
/// value never matches.
fn property_value_matches_url(value: &PropertyValue, url: &Url) -> bool {
    if let PropertyValue::Url(candidate) = value {
        return urls_match(candidate, url);
    }

    if let PropertyValue::Plain(text) = value {
        // An unparseable string simply does not match.
        return match Url::parse(text) {
            Ok(parsed) => urls_match(&parsed, url),
            Err(_) => false,
        };
    }

    false
}

/// Produce an owned `Url` from a property value, when it holds one.
///
/// URL-typed values are cloned; plain-text values are parsed, yielding `None`
/// when the text is not a valid URL. All other value kinds yield `None`.
fn property_value_as_url(value: &PropertyValue) -> Option<Url> {
    if let PropertyValue::Url(link) = value {
        return Some((**link).clone());
    }

    if let PropertyValue::Plain(text) = value {
        return Url::parse(text).ok();
    }

    None
}

/// Decide whether two URLs are equal component by component.
///
/// The compared components are scheme, host, port, path, query and fragment;
/// nothing else (such as user info) takes part in the comparison.
fn urls_match(a: &Url, b: &Url) -> bool {
    // Gather the compared components of each URL and test them as one tuple.
    let left = (
        a.scheme(),
        a.host(),
        a.port(),
        a.path(),
        a.query(),
        a.fragment(),
    );
    let right = (
        b.scheme(),
        b.host(),
        b.port(),
        b.path(),
        b.query(),
        b.fragment(),
    );

    left == right
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let html = r#"
    <!DOCTYPE html>
    <html>
    <head>
        <link rel="me" href="https://example.com/">
    </head>
    <body>
        <div class="h-card">
            <a class="u-url u-uid" href="https://example.com/">
                <span class="p-name">Jane Doe</span>
            </a>
            <span class="p-email">jane@example.com</span>
        </div>
        <article class="h-entry">
            <h1 class="p-name">A Blog Post</h1>
        </article>
    </body>
    </html>
    "#;

    let page_url: Url = "https://example.com/".parse()?;
    let document = microformats::from_html(html, &page_url)?;

    match find_representative_hcard(&document, &page_url) {
        Some(hcard) => {
            println!("Found representative h-card!");
            if let Some(name) = hcard.properties.get("name").and_then(|v| v.first()) {
                println!("  Name: {:?}", name);
            }
            if let Some(url) = hcard.properties.get("url").and_then(|v| v.first()) {
                println!("  URL: {:?}", url);
            }
            if let Some(email) = hcard.properties.get("email").and_then(|v| v.first()) {
                println!("  Email: {:?}", email);
            }
        }
        None => {
            println!("No representative h-card found on this page.");
        }
    }

    Ok(())
}