microformats 0.17.0

A union library of the Microformats types and associated parser.
Documentation
use microformats_types::{PropertyValue, temporal};

use super::*;
use swc_common::{BytePos, FileName, SourceFile};
use swc_html_parser::parser::ParserConfig;
use tracing_test::traced_test;

/// Parses `html` into an `swc_html_ast::Document` for use as a test fixture.
///
/// The source is registered as an anonymous file and parsed with
/// `scripting_enabled` and `iframe_srcdoc` off and `allow_self_closing` on.
/// Errors the parser pushes into its error list are collected but never
/// inspected.
///
/// # Panics
///
/// Panics if `swc_html_parser::parse_file_as_document` returns an `Err`.
pub fn from_html_str(html: &str) -> swc_html_ast::Document {
    let source = SourceFile::new(
        FileName::Anon.into(),
        false,
        FileName::Anon.into(),
        html.to_string().into(),
        BytePos(1),
    );
    let mut reported_errors = Default::default();

    swc_html_parser::parse_file_as_document(
        &source,
        ParserConfig {
            scripting_enabled: false,
            iframe_srcdoc: false,
            allow_self_closing: true,
        },
        &mut reported_errors,
    )
    .unwrap()
}

/// Searches `child` and its descendants, depth first, for an element named
/// `tag_name`.
///
/// Returns a clone of `child` itself when it is an element with that tag
/// name; otherwise returns the first match found among its children, visited
/// in the order they are stored. Nodes that are not elements yield `None`.
pub fn grab_element_from_child(
    child: &swc_html_ast::Child,
    tag_name: &str,
) -> Option<swc_html_ast::Element> {
    match child {
        Child::Element(elem) if elem.tag_name == tag_name => Some(elem.clone()),
        Child::Element(elem) => elem
            .children
            .iter()
            .find_map(|nested| grab_element_from_child(nested, tag_name)),
        _ => None,
    }
}

/// Returns a clone of the first element named `tag_name` found in `doc`.
///
/// Each top-level child of the document is searched in turn with
/// `grab_element_from_child`; `None` is returned when no child contains a
/// matching element.
pub fn grab_element_from_document(
    doc: &swc_html_ast::Document,
    tag_name: &str,
) -> Option<swc_html_ast::Element> {
    for top_level in &doc.children {
        if let Some(found) = grab_element_from_child(top_level, tag_name) {
            return Some(found);
        }
    }
    None
}

// Parameterized check that the number of top-level item elements matched in
// each fixture equals `count`. The second fixture mixes a lowercase `h-adr`
// root with `H-adr` / `h-Adr` variants and expects only one root.
#[traced_test]
#[yare::parameterized(
    one_thing = { r#"
<html>
    <body>
        <main class="h-entry" id="one-thing">
            <h1 id="two-thing" class="p-name">The Title</h1>
            <p id="three-thing">This is the expected contents of the 'content' property.</p>
            <p id="four-thing">This will <em>be included</em> as well.</p>
        </main>
    </body>
</html>

        "#, 1 },
    only_one_valid = { r#"
<p class="h-adr">
    <span class="p-name">Bricklayer's Arms</span>
        <span class="p-street-address">3 Charlotte Road</span>,
        <span class="p-locality">City of London</span>,
        <span class="P-postal-code">EC2A 3PE</span>,
        <span class="p-country-Name">UK</span>
</p>
<p class="H-adr">
    <span class="p-name">Bricklayer's Arms</span>
        <span class="p-street-address">3 Charlotte Road</span>,
        <span class="p-locality">City of London</span>,
        <span class="p-postal-code">EC2A 3PE</span>,
        <span class="p-country-name">UK</span>
</p>
<p class="h-Adr">
    <span class="p-name">Bricklayer's Arms</span>
        <span class="p-street-address">3 Charlotte Road</span>,
        <span class="p-locality">City of London</span>,
        <span class="p-postal-code">EC2A 3PE</span>,
        <span class="p-country-name">UK</span>
</p>
"#, 1}
)]
fn only_top_level_item_elements(html: &str, count: usize) {
    let mut doc = from_html_str(html);
    let matched = MatchedElements::for_document_default(&mut doc).expect("parsed the doc");
    let root_count = matched.top_level_elements().len();

    assert_eq!(root_count, count, "computes correct count of roots");
}

// An `h-feed` root that contains six empty `h-entry` articles must be matched
// as a single top-level element and expand to an item with six children.
#[traced_test]
#[test]
fn expand_items_only_children() -> Result<(), crate::Error> {
    let base_url: url::Url = "https://example.com".parse()?;
    let mut feed_doc = from_html_str(
        r#"
<html>
    <body>
        <main class="h-feed">
            <article id="e1" class="h-entry"></article>
            <article id="e2" class="h-entry"></article>
            <article id="e3" class="h-entry"></article>
            <article id="e4" class="h-entry"></article>
            <article id="e5" class="h-entry"></article>
            <article id="e6" class="h-entry"></article>
        </main>
    </body>
</html>
"#,
    );

    let matched = MatchedElements::for_document_default(&mut feed_doc).expect("parsed the doc");
    let roots = matched.top_level_elements();

    assert_eq!(roots.len(), 1, "computes correct count of root elements");

    // Expand the only root (the feed) and count the entries nested inside it.
    let feed = matched.expand_item_from_element(Arc::clone(&roots[0]), &base_url)?;
    let child_count = feed.children.len();

    assert_eq!(child_count, 6, "computes correct count of children");

    Ok(())
}

// An `h-entry` that only carries properties (no nested items) must expand to
// exactly the property names declared in the markup, and a property that is
// declared twice (`dt-today`) must keep both of its values.
#[test]
fn expand_items_only_properties() -> Result<(), crate::Error> {
    let base_url: url::Url = "https://example.com".parse()?;
    // Two separately captured timestamps feed the two `dt-today` elements.
    let stamp = temporal::Stamp::now();
    let dt = stamp.to_string();
    let stamp2 = temporal::Stamp::now();
    let dt2 = stamp2.to_string();
    let mut only_child_doc = from_html_str(&format!(
        r#"
<article class="h-entry">
    <span class="p-name">The name of this.</span>
    <a href="/uid" class="u-uid"></a>
    <a href="/" class="u-url"></a>
    <p class="p-content">This is it.</p>
    <time class="dt-today" datetime="{dt}">today</time>
    <time class="dt-today" datetime="{dt2}">today again</time>
</article>
"#
    ));

    let matched_elements =
        MatchedElements::for_document_default(&mut only_child_doc).expect("parsed the doc");

    assert_eq!(
        matched_elements.top_level_elements().len(),
        1,
        "computes correct count of top level elements"
    );

    let item_element = Arc::clone(&matched_elements.top_level_elements()[0]);
    let item = matched_elements.expand_item_from_element(item_element, &base_url)?;

    // Borrow the key names instead of cloning every `String` just to compare.
    let property_names: Vec<&str> = item.properties.keys().map(String::as_str).collect();
    assert_eq!(
        property_names,
        ["content", "name", "today", "uid", "url"],
        "exposes exactly the expected property names"
    );

    // Only the length is needed, so read it through a borrow rather than
    // cloning the whole value list.
    let dt_today = &item.properties["today"];
    assert_eq!(dt_today.len(), 2, "stored two values for dt-today");

    Ok(())
}

// An `h-entry` whose `u-author` element is itself an `h-card` must expose
// `author` among its properties, and the first `author` value must be a
// nested item rather than a plain value.
#[test]
fn expand_items_properties_with_item() -> Result<(), crate::Error> {
    let base_url: url::Url = "https://example.com".parse()?;
    let mut only_child_doc = from_html_str(
        r#"
<article class="h-entry">
    <span class="p-name">The name of this.</span>
    <a href="/author" class="u-author h-card">written by <span class="p-name">me</span></a>
    <a href="/uid" class="u-uid"></a>
    <a href="/" class="u-url"></a>
    <p class="p-content">This is it.</p>
</article>
"#,
    );

    let matched_elements = MatchedElements::for_document_default(&mut only_child_doc)?;

    assert_eq!(
        matched_elements.top_level_elements().len(),
        1,
        "computes correct count of top level elements"
    );

    let item_element = Arc::clone(&matched_elements.top_level_elements()[0]);
    let item = matched_elements.expand_item_from_element(item_element, &base_url)?;

    // Borrow the key names instead of cloning every `String` just to compare.
    let property_names: Vec<&str> = item.properties.keys().map(String::as_str).collect();
    assert_eq!(
        property_names,
        ["author", "content", "name", "uid", "url"],
        "exposes exactly the expected property names"
    );

    // Pattern-match through the borrow; the value itself is never needed, so
    // there is no reason to clone it.
    let author_item = item.properties["author"].first();

    assert!(
        matches!(author_item, Some(PropertyValue::Item(_))),
        "captures an item"
    );

    Ok(())
}

// Text extraction on a `<p>` that mixes `<style>`, `<script>`, `<span>` and
// `<img alt>` children must produce " John Doe Jr. " and no links.
#[traced_test]
#[test]
fn node_text_content() {
    let doc = from_html_str(
        r#"
<!-- drop nested <script> and <style>, replace <img> with alt -->
<p class="h-card"><style>p{font-color: red;}</style> <span>John</span> <span>Doe</span><script src="https://example.com/script.js"></script> <img src="/photo.jpg" alt="Jr."> </p>
"#,
    );
    let elem = grab_element_from_document(&doc, "p").unwrap();

    let extracted = Node { elem }.text_content(&"http://example.com".parse().unwrap());
    let expected = Ok(Extraction {
        text: " John Doe Jr. ".to_string(),
        links: Default::default(),
    });

    assert_eq!(
        extracted, expected,
        "trims away any excess whitespace, inline styling and scripts"
    );
}

// The HTML content of `<main>` must be the serialized markup of its children,
// without the `<main>` wrapper itself.
#[traced_test]
#[test]
fn node_html_content() {
    let doc = from_html_str(
        "<main><div>Well. This is <strong>exciting</strong>.</div>\n<p>Don't you agree?</p></main>",
    );
    let elem = grab_element_from_document(&doc, "main").unwrap();

    let inner_html = Node { elem }.html_content();
    let expected =
        Ok("<div>Well. This is <strong>exciting</strong>.</div>\n<p>Don't you agree?</p>"
            .to_string());

    assert_eq!(inner_html, expected, "extracts expected HTML");
}

/// Expected outcome for one parameterized `element_locations` case.
///
/// Derives `Debug` so an expectation can be printed in assertion messages and
/// other test diagnostics.
#[derive(Debug)]
struct Expectation {
    /// Number of top-level item elements the fixture should yield.
    top_level_elements: usize,
    /// Number of properties expected on the first top-level item.
    property_count: usize,
}

#[yare::parameterized(
    bare = {
        r#"
<html>
    <body>
        <main class="h-entry">
            A wild place.
        </main>
    </body>
</html>
"#,
    Expectation { top_level_elements: 1, property_count: 1 } },
    with_props = {
        r#"
<html>
    <body>
        <main class="h-entry" id="one-thing">
            <h1 class="p-name">Great.</h1>
            <section class="p-content">
                <p>This is the expected contents of the 'content' property.</p>
                <p>This will <em>be included</em> as well.</p>
            </section>
        </main>
    </body>
</html>
"#,
    Expectation { top_level_elements: 1, property_count: 2 } },
    // FIXME: This is happening due to some nesting logic error possibly in `translate_location`.
    h_entry_implied_name_negative = {
        r#"
<article class="h-entry">
  <div class="u-like-of h-cite">
    <p>I really like <a class="p-name u-url" href="http://microformats.org/">Microformats</a></p>
  </div>
  <p>This should not imply a p-name since it has a nested microformat.</p>
</article>
"#,
    Expectation { top_level_elements: 1, property_count: 1 } },

)]
fn element_locations(html: &str, expecting: Expectation) -> Result<(), crate::Error> {
    let mut doc = from_html_str(html);
    let elements = MatchedElements::for_document_default(&mut doc).expect("parsed the doc");

    let item_elems = elements.top_level_elements();
    assert_eq!(item_elems.len(), expecting.top_level_elements);

    let item = elements
        .expand_item_from_element(Arc::clone(&item_elems[0]), &"http://example.com".parse()?)?;

    assert_eq!(item.properties.len(), expecting.property_count);
    Ok(())
}

// A single `h-entry` with a `p-name` heading and a `p-content` section must
// be the only top-level item and expand to exactly two properties.
#[test]
fn element_locations_for_document_with_properties() -> Result<(), crate::Error> {
    let base_url: url::Url = "https://example.com".parse()?;
    let mut doc = from_html_str(
        r#"
<html>
    <body>
        <main class="h-entry" id="one-thing">
            <h1 class="p-name">Great.</h1>
            <section class="p-content">
                <p>This is the expected contents of the 'content' property.</p>
                <p>This will <em>be included</em> as well.</p>
            </section>
        </main>
    </body>
</html>
"#,
    );

    let matched = MatchedElements::for_document_default(&mut doc).expect("parsed the doc");

    let roots = matched.top_level_elements();
    assert_eq!(roots.len(), 1, "found only one top-level item");

    let entry_element = Arc::clone(&roots[0]);
    let entry = matched.expand_item_from_element(entry_element, &base_url)?;
    let property_total = entry.properties.len();

    assert_eq!(property_total, 2, "two properties are defined");

    Ok(())
}

// The fixture carries three elements with a `rel` attribute: two `<link>`s in
// the head and one `<a>` in the body. All three must be matched as link
// relation elements and, once expanded, show up in the document's `rels.items`.
#[test]
fn link_expander() -> Result<(), crate::Error> {
    let base_url: url::Url = "https://example.com".parse()?;
    let mut html_doc = from_html_str(
        r#"
<html>
    <head>
        <link rel="alternative" href="/rss.xml" type="application/rss+xml" title="RSS Feed" />
        <link rel="webmention" href="/endpoints/webmention" title="Direct" />
    </head>
    <body>
        <a rel="me" href="/me"></a>
    </body>
</html>
"#,
    );

    let matched = MatchedElements::for_document_default(&mut html_doc).expect("parsed the doc");
    let rel_elements = matched.link_relation_elements();

    assert_eq!(rel_elements.len(), 3);

    let expander = LinkRelationExpander {
        base_url,
        elements: rel_elements,
    };

    // Expand into a fresh, empty document so only this fixture's rels are counted.
    let mut document = Document::default();
    let outcome = expander.expand(&mut document);
    assert_eq!(outcome, Ok(()));

    let expanded_total = document.rels.items.len();
    assert_eq!(expanded_total, 3);

    Ok(())
}