use microformats_types::{PropertyValue, temporal};
use super::*;
use swc_common::{BytePos, FileName, SourceFile};
use swc_html_parser::parser::ParserConfig;
use tracing_test::traced_test;
/// Parses `html` into an `swc_html_ast::Document` for use by the tests below.
///
/// The text is wrapped in an anonymous `SourceFile` and parsed with scripting
/// and iframe-srcdoc handling turned off and self-closing tags allowed.
/// Errors the parser pushes into its error list are not inspected.
///
/// # Panics
///
/// Panics when `swc_html_parser::parse_file_as_document` returns an `Err`.
pub fn from_html_str(html: &str) -> swc_html_ast::Document {
    let source = SourceFile::new(
        FileName::Anon.into(),
        false,
        FileName::Anon.into(),
        html.to_string().into(),
        BytePos(1),
    );
    // Sink for the parser's error list; its contents are ignored.
    let mut parse_errors = Default::default();
    swc_html_parser::parse_file_as_document(
        &source,
        ParserConfig {
            scripting_enabled: false,
            iframe_srcdoc: false,
            allow_self_closing: true,
        },
        &mut parse_errors,
    )
    .unwrap()
}
/// Searches `child` depth-first for an element whose tag name equals `tag_name`.
///
/// The element itself is checked before its descendants. The first match is
/// returned as an owned copy. Returns `None` when `child` is not an element
/// or when nothing in its subtree matches.
pub fn grab_element_from_child(
    child: &swc_html_ast::Child,
    tag_name: &str,
) -> Option<swc_html_ast::Element> {
    match child {
        Child::Element(elem) if elem.tag_name == tag_name => Some(elem.clone()),
        Child::Element(elem) => elem
            .children
            .iter()
            .find_map(|nested| grab_element_from_child(nested, tag_name)),
        _ => None,
    }
}
/// Returns an owned copy of the first element in `doc` whose tag name equals
/// `tag_name`, or `None` when no such element exists.
///
/// The document's direct children are visited in order and each subtree is
/// searched with [`grab_element_from_child`].
pub fn grab_element_from_document(
    doc: &swc_html_ast::Document,
    tag_name: &str,
) -> Option<swc_html_ast::Element> {
    for top_level in doc.children.iter() {
        if let Some(found) = grab_element_from_child(top_level, tag_name) {
            return Some(found);
        }
    }
    None
}
// Checks how many top-level item elements are matched for each fixture.
//
// * `one_thing`: a single `h-entry` with nested property elements; one root.
// * `only_one_valid`: three `<p>` elements classed `h-adr`, `H-adr` and
//   `h-Adr`; exactly one root is expected (presumably only the all-lowercase
//   class name counts; confirm against the matcher).
#[traced_test]
#[yare::parameterized(
one_thing = { r#"
<html>
<body>
<main class="h-entry" id="one-thing">
<h1 id="two-thing" class="p-name">The Title</h1>
<p id="three-thing">This is the expected contents of the 'content' property.</p>
<p id="four-thing">This will <em>be included</em> as well.</p>
</main>
</body>
</html>
"#, 1 },
only_one_valid = { r#"
<p class="h-adr">
<span class="p-name">Bricklayer's Arms</span>
<span class="p-street-address">3 Charlotte Road</span>,
<span class="p-locality">City of London</span>,
<span class="P-postal-code">EC2A 3PE</span>,
<span class="p-country-Name">UK</span>
</p>
<p class="H-adr">
<span class="p-name">Bricklayer's Arms</span>
<span class="p-street-address">3 Charlotte Road</span>,
<span class="p-locality">City of London</span>,
<span class="p-postal-code">EC2A 3PE</span>,
<span class="p-country-name">UK</span>
</p>
<p class="h-Adr">
<span class="p-name">Bricklayer's Arms</span>
<span class="p-street-address">3 Charlotte Road</span>,
<span class="p-locality">City of London</span>,
<span class="p-postal-code">EC2A 3PE</span>,
<span class="p-country-name">UK</span>
</p>
"#, 1}
)]
fn only_top_level_item_elements(html: &str, count: usize) {
    let mut document = from_html_str(html);
    let root_count = MatchedElements::for_document_default(&mut document)
        .expect("parsed the doc")
        .top_level_elements()
        .len();
    assert_eq!(root_count, count, "computes correct count of roots");
}
// An `h-feed` wrapping six empty `h-entry` articles must yield one top-level
// element whose expanded item has six children.
#[traced_test]
#[test]
fn expand_items_only_children() -> Result<(), crate::Error> {
    let mut feed_doc = from_html_str(
        r#"
<html>
<body>
<main class="h-feed">
<article id="e1" class="h-entry"></article>
<article id="e2" class="h-entry"></article>
<article id="e3" class="h-entry"></article>
<article id="e4" class="h-entry"></article>
<article id="e5" class="h-entry"></article>
<article id="e6" class="h-entry"></article>
</main>
</body>
</html>
"#,
    );
    let base_url: url::Url = "https://example.com".parse()?;
    let matched = MatchedElements::for_document_default(&mut feed_doc).expect("parsed the doc");

    let roots = matched.top_level_elements();
    assert_eq!(roots.len(), 1, "computes correct count of root elements");

    let feed = matched.expand_item_from_element(Arc::clone(&roots[0]), &base_url)?;
    assert_eq!(feed.children.len(), 6, "computes correct count of children");
    Ok(())
}
// An `h-entry` holding only property elements expands into an item whose
// property names are exactly `content`, `name`, `today`, `uid`, `url` (in
// that order), and the repeated `dt-today` class stores two values.
#[test]
fn expand_items_only_properties() -> Result<(), crate::Error> {
    let base_url: url::Url = "https://example.com".parse()?;
    // Two timestamps rendered as strings, one per `<time>` element.
    let dt = temporal::Stamp::now().to_string();
    let dt2 = temporal::Stamp::now().to_string();
    let markup = format!(
        r#"
<article class="h-entry">
<span class="p-name">The name of this.</span>
<a href="/uid" class="u-uid"></a>
<a href="/" class="u-url"></a>
<p class="p-content">This is it.</p>
<time class="dt-today" datetime="{dt}">today</time>
<time class="dt-today" datetime="{dt2}">today again</time>
</article>
"#
    );
    let mut entry_doc = from_html_str(&markup);
    let matched = MatchedElements::for_document_default(&mut entry_doc).expect("parsed the doc");

    let roots = matched.top_level_elements();
    assert_eq!(
        roots.len(),
        1,
        "computes correct count of top level elements"
    );

    let entry = matched.expand_item_from_element(Arc::clone(&roots[0]), &base_url)?;
    let found_names: Vec<_> = entry.properties.keys().cloned().collect();
    let wanted_names: Vec<String> = ["content", "name", "today", "uid", "url"]
        .iter()
        .map(|name| name.to_string())
        .collect();
    assert_eq!(
        found_names, wanted_names,
        "computes correct count of properties"
    );

    assert_eq!(
        entry.properties["today"].len(),
        2,
        "stored two values for dt-today"
    );
    Ok(())
}
// An `h-entry` whose `u-author` element is also an `h-card` must expose
// `author` among its property names, and the first `author` value must be a
// `PropertyValue::Item`.
#[test]
fn expand_items_properties_with_item() -> Result<(), crate::Error> {
    let mut entry_doc = from_html_str(
        r#"
<article class="h-entry">
<span class="p-name">The name of this.</span>
<a href="/author" class="u-author h-card">written by <span class="p-name">me</span></a>
<a href="/uid" class="u-uid"></a>
<a href="/" class="u-url"></a>
<p class="p-content">This is it.</p>
</article>
"#,
    );
    let base_url: url::Url = "https://example.com".parse()?;
    let matched = MatchedElements::for_document_default(&mut entry_doc)?;

    let roots = matched.top_level_elements();
    assert_eq!(
        roots.len(),
        1,
        "computes correct count of top level elements"
    );

    let entry = matched.expand_item_from_element(Arc::clone(&roots[0]), &base_url)?;
    let found_names: Vec<_> = entry.properties.keys().cloned().collect();
    let wanted_names: Vec<String> = ["author", "content", "name", "uid", "url"]
        .iter()
        .map(|name| name.to_string())
        .collect();
    assert_eq!(
        found_names, wanted_names,
        "computes correct count of properties"
    );

    let holds_item = matches!(
        entry.properties["author"].first(),
        Some(PropertyValue::Item(_))
    );
    assert!(holds_item, "captures an item");
    Ok(())
}
// `Node::text_content` on a `<p>` containing `<style>`, `<script>`, two
// `<span>`s and an `<img alt="Jr.">` must produce the text " John Doe Jr. "
// with a default (empty) `links` value.
#[traced_test]
#[test]
fn node_text_content() {
    let doc = from_html_str(
        r#"
<!-- drop nested <script> and <style>, replace <img> with alt -->
<p class="h-card"><style>p{font-color: red;}</style> <span>John</span> <span>Doe</span><script src="https://example.com/script.js"></script> <img src="/photo.jpg" alt="Jr."> </p>
"#,
    );
    let elem = grab_element_from_document(&doc, "p").unwrap();
    let extracted = Node { elem }.text_content(&"http://example.com".parse().unwrap());
    let expected = Extraction {
        text: " John Doe Jr. ".to_string(),
        links: Default::default(),
    };
    assert_eq!(
        extracted,
        Ok(expected),
        "trims away any excess whitespace, inline styling and scripts"
    );
}
// `Node::html_content` on `<main>` must return the markup of its children
// (the `<div>`, the newline and the `<p>`) without the `<main>` tag itself.
#[traced_test]
#[test]
fn node_html_content() {
    let doc = from_html_str(
        "<main><div>Well. This is <strong>exciting</strong>.</div>\n<p>Don't you agree?</p></main>",
    );
    let elem = grab_element_from_document(&doc, "main").unwrap();
    let inner_html = Node { elem }.html_content();
    let expected =
        "<div>Well. This is <strong>exciting</strong>.</div>\n<p>Don't you agree?</p>".to_string();
    assert_eq!(inner_html, Ok(expected), "extracts expected HTML");
}
/// Expected results for one case of the parameterized `element_locations` test.
struct Expectation {
/// Number of top-level item elements the document should match.
top_level_elements: usize,
/// Number of properties on the item expanded from the first top-level element.
property_count: usize,
}
#[yare::parameterized(
bare = {
r#"
<html>
<body>
<main class="h-entry">
A wild place.
</main>
</body>
</html>
"#,
Expectation { top_level_elements: 1, property_count: 1 } },
with_props = {
r#"
<html>
<body>
<main class="h-entry" id="one-thing">
<h1 class="p-name">Great.</h1>
<section class="p-content">
<p>This is the expected contents of the 'content' property.</p>
<p>This will <em>be included</em> as well.</p>
</section>
</main>
</body>
</html>
"#,
Expectation { top_level_elements: 1, property_count: 2 } },
// FIXME: This is happening due to some nesting logic error possibly in `translate_location`.
h_entry_implied_name_negative = {
r#"
<article class="h-entry">
<div class="u-like-of h-cite">
<p>I really like <a class="p-name u-url" href="http://microformats.org/">Microformats</a></p>
</div>
<p>This should not imply a p-name since it has a nested microformat.</p>
</article>
"#,
Expectation { top_level_elements: 1, property_count: 1 } },
)]
fn element_locations(html: &str, expecting: Expectation) -> Result<(), crate::Error> {
let mut doc = from_html_str(html);
let elements = MatchedElements::for_document_default(&mut doc).expect("parsed the doc");
let item_elems = elements.top_level_elements();
assert_eq!(item_elems.len(), expecting.top_level_elements);
let item = elements
.expand_item_from_element(Arc::clone(&item_elems[0]), &"http://example.com".parse()?)?;
assert_eq!(item.properties.len(), expecting.property_count);
Ok(())
}
// A single `h-entry` with a `p-name` heading and a `p-content` section must
// match as one top-level item carrying exactly two properties.
#[test]
fn element_locations_for_document_with_properties() -> Result<(), crate::Error> {
    let mut entry_doc = from_html_str(
        r#"
<html>
<body>
<main class="h-entry" id="one-thing">
<h1 class="p-name">Great.</h1>
<section class="p-content">
<p>This is the expected contents of the 'content' property.</p>
<p>This will <em>be included</em> as well.</p>
</section>
</main>
</body>
</html>
"#,
    );
    let base_url: url::Url = "https://example.com".parse()?;
    let matched = MatchedElements::for_document_default(&mut entry_doc).expect("parsed the doc");

    let roots = matched.top_level_elements();
    assert_eq!(roots.len(), 1, "found only one top-level item");

    let entry = matched.expand_item_from_element(Arc::clone(&roots[0]), &base_url)?;
    assert_eq!(entry.properties.len(), 2, "two properties are defined");
    Ok(())
}
// Two `<link rel>` elements in `<head>` plus one `<a rel="me">` in `<body>`
// must be matched as three link-relation elements, and expanding them must
// leave three entries in the document's `rels.items`.
#[test]
fn link_expander() -> Result<(), crate::Error> {
    let mut rel_doc = from_html_str(
        r#"
<html>
<head>
<link rel="alternative" href="/rss.xml" type="application/rss+xml" title="RSS Feed" />
<link rel="webmention" href="/endpoints/webmention" title="Direct" />
</head>
<body>
<a rel="me" href="/me"></a>
</body>
</html>
"#,
    );
    let base_url: url::Url = "https://example.com".parse()?;
    let matched = MatchedElements::for_document_default(&mut rel_doc).expect("parsed the doc");

    let relation_elements = matched.link_relation_elements();
    assert_eq!(relation_elements.len(), 3);

    let expander = LinkRelationExpander {
        base_url,
        elements: relation_elements,
    };
    let mut parsed = Document::default();
    let outcome = expander.expand(&mut parsed);
    assert_eq!(outcome, Ok(()));
    assert_eq!(parsed.rels.items.len(), 3);
    Ok(())
}