use crate::parse::{Error, element::Node};
use microformats_types::{Image, PropertyValue, UrlValue};
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
use url::Url;
/// One `<source>` child of a `<picture>` element.
///
/// Serialized with kebab-case keys. `media`, `type`, `sizes` and `attributes`
/// are omitted from the serialized form when empty and take their default
/// value when missing during deserialization.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub struct PictureSource {
/// Image candidates parsed from the element's `srcset` attribute.
pub srcset: Vec<SrcSetEntry>,
/// Value of the `media` attribute; empty when the attribute is absent.
#[serde(default, skip_serializing_if = "String::is_empty")]
pub media: String,
/// Value of the `type` attribute; empty when the attribute is absent.
#[serde(default, skip_serializing_if = "String::is_empty")]
pub r#type: String,
/// Value of the `sizes` attribute; empty when the attribute is absent.
#[serde(default, skip_serializing_if = "String::is_empty")]
pub sizes: String,
/// Attribute map of the `<source>` element as reported by
/// `Node::full_attribute_map`.
#[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
pub attributes: BTreeMap<String, String>,
}
/// A single image candidate from a `srcset` attribute.
///
/// Serialized with kebab-case keys; descriptor fields that are `None` are
/// omitted from the serialized form.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub struct SrcSetEntry {
/// Candidate URL, already resolved against the base URL used for parsing.
pub url: Url,
/// Numeric part of a `<n>w` width descriptor, if one was given.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub width: Option<u32>,
/// Numeric part of a `<n>h` height descriptor, if one was given.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub height: Option<u32>,
/// Density descriptor kept as text (for example `"2x"`); the parser stores
/// a bare `x` as `"1x"`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub density: Option<String>,
}
/// Parsed representation of a `<picture>` element.
///
/// Every field is omitted from the serialized form when empty, so a picture
/// without sources, fallback or attributes serializes to `{}`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub struct Picture {
/// `<source>` children that carried a non-empty `srcset`, in document order.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub sources: Vec<PictureSource>,
/// Image taken from an `<img>` child whose `src` resolved to a URL.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub fallback: Option<Image>,
/// Attribute map of the `<picture>` element itself.
#[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
pub attributes: BTreeMap<String, String>,
}
/// Configuration for [`PictureParser`], assembled through the `with_*`
/// builder methods.
#[derive(Debug, Clone)]
pub struct PictureParserBuilder {
// NOTE(review): stored but not consulted by any parsing code visible in
// this file — confirm whether alt extraction is meant to be switchable.
extract_alt_text: bool,
// NOTE(review): stored but not consulted by any code visible in this file.
follow_redirects: bool,
// Optional size limit in bytes; `parse_img_element` only logs it, the
// limit is not enforced there.
max_file_size: Option<u64>,
}
impl PictureParserBuilder {
    /// Returns the builder with the alt-text extraction flag set to `enable`.
    pub fn with_alt_text_extraction(self, enable: bool) -> Self {
        Self {
            extract_alt_text: enable,
            ..self
        }
    }

    /// Returns the builder with the redirect-following flag set to `enable`.
    pub fn with_redirect_following(self, enable: bool) -> Self {
        Self {
            follow_redirects: enable,
            ..self
        }
    }

    /// Returns the builder with the maximum file size set to `size` bytes.
    pub fn with_max_file_size(self, size: u64) -> Self {
        Self {
            max_file_size: Some(size),
            ..self
        }
    }
}
#[derive(Debug, Clone)]
pub struct PictureParser {
config: PictureParserBuilder,
}
impl Default for PictureParser {
fn default() -> Self {
Self {
config: PictureParserBuilder::default(),
}
}
}
impl PictureParser {
pub fn new() -> Self {
Self::default()
}
pub fn with_config(config: PictureParserBuilder) -> Self {
Self { config }
}
/// Parses `node` as a `<picture>` element.
///
/// Returns `Ok(None)` when `node` is not a `<picture>`; URLs found in the
/// element are resolved against `base_url`.
///
/// # Errors
///
/// Propagates any error raised while parsing a `srcset` attribute.
pub fn parse_picture_element(
    &self,
    node: &Node,
    base_url: &Url,
) -> Result<Option<Picture>, Error> {
    let picture = self.parse_picture_element_with_sources(node, base_url)?;
    Ok(picture)
}
/// Extracts an image-like property value from an `<img>`, `<picture>` or
/// `<video>` element.
///
/// For `<picture>` only the fallback image is reported; a picture without
/// one yields `Ok(None)`, as does any other tag.
///
/// # Errors
///
/// Propagates errors from the element-specific parsers.
pub fn parse_image_element(
    &self,
    node: &Node,
    base_url: &Url,
) -> Result<Option<PropertyValue>, Error> {
    match node.tag() {
        "img" => self.parse_img_element(node, base_url),
        "picture" => {
            let fallback = self
                .parse_picture_element_with_sources(node, base_url)?
                .and_then(|picture| picture.fallback);
            Ok(fallback.map(PropertyValue::Image))
        }
        "video" => self.parse_video_element(node, base_url),
        _ => Ok(None),
    }
}
/// Builds a property value from an `<img>` element.
///
/// Yields `PropertyValue::Image` when an `alt` attribute is present (even if
/// empty) and `PropertyValue::Url` otherwise. A missing, empty or
/// unresolvable `src` yields `Ok(None)` after a debug log entry.
fn parse_img_element(
    &self,
    node: &Node,
    base_url: &Url,
) -> Result<Option<PropertyValue>, Error> {
    let src = node.attr("src").filter(|value| !value.is_empty());
    let alt = node.attr("alt");

    let resolved = src.as_deref().and_then(|raw| base_url.join(raw).ok());
    let src_url = if let Some(url) = resolved {
        url
    } else {
        tracing::debug!(
            "Failed to resolve URL from src attribute '{}' with base_url '{}'",
            src.unwrap_or_default(),
            base_url
        );
        return Ok(None);
    };

    // The limit is only reported for now; nothing is measured or rejected.
    if let Some(max_size) = self.config.max_file_size {
        tracing::debug!(
            "File size limit of {} bytes set, but checking not yet implemented",
            max_size
        );
    }

    let property = match alt {
        Some(alt_text) => PropertyValue::Image(Image {
            value: src_url,
            alt: Some(alt_text),
        }),
        None => PropertyValue::Url(UrlValue::new(src_url)),
    };
    Ok(Some(property))
}
fn parse_picture_element_with_sources(
&self,
node: &Node,
base_url: &Url,
) -> Result<Option<Picture>, Error> {
if node.tag() != "picture" {
return Ok(None);
}
let mut sources = Vec::new();
let mut fallback = None;
let mut attributes = BTreeMap::new();
attributes.extend(node.full_attribute_map());
for child_element in node.elements() {
match child_element.tag() {
"source" => {
let srcset_attr = child_element.attr("srcset").filter(|s| !s.is_empty());
let media_attr = child_element.attr("media").unwrap_or_default();
let type_attr = child_element.attr("type").unwrap_or_default();
let sizes_attr = child_element.attr("sizes").unwrap_or_default();
if let Some(srcset) = srcset_attr {
let srcset_entries = self.parse_srcset_with_base(&srcset, base_url)?;
let source = PictureSource {
srcset: srcset_entries,
media: media_attr,
r#type: type_attr,
sizes: sizes_attr,
attributes: child_element.full_attribute_map(),
};
sources.push(source);
}
}
"img" => {
if let Some(img_property) = self.parse_img_element(&child_element, base_url)? {
match img_property {
PropertyValue::Image(image) => {
fallback = Some(image);
}
PropertyValue::Url(url_value) => {
fallback = Some(Image {
value: (*url_value).clone(),
alt: None,
});
}
_ => {
}
}
}
}
_ => {
continue;
}
}
}
let picture = Picture {
sources,
fallback,
attributes,
};
Ok(Some(picture))
}
/// Extracts the poster image of a `<video>` element as a URL property.
///
/// Returns `Ok(None)` when the `poster` attribute is missing, empty, or
/// cannot be resolved against `base_url`.
///
/// The parameters were previously spelled `_node` / `_base_url`; the leading
/// underscore marks a binding as intentionally unused, which they are not.
fn parse_video_element(
    &self,
    node: &Node,
    base_url: &Url,
) -> Result<Option<PropertyValue>, Error> {
    let poster = node
        .attr("poster")
        .filter(|value| !value.is_empty())
        .and_then(|raw| base_url.join(&raw).ok());
    Ok(poster.map(|url| PropertyValue::Url(UrlValue::new(url))))
}
/// Reads the `width` and `height` attributes of `node` as `(width, height)`.
///
/// Returns `None` unless both attributes are present and parse as `u32`.
pub fn extract_dimensions(&self, node: &Node) -> Option<(u32, u32)> {
    let parse_dimension = |name: &str| -> Option<u32> {
        node.attr(name).and_then(|raw| raw.parse().ok())
    };
    let width = parse_dimension("width");
    let height = parse_dimension("height");
    width.zip(height)
}
/// Heuristically decides whether `node` looks like a representative image.
///
/// True when the `class` or `id` attribute contains one of a fixed list of
/// keywords as a substring (so `u-photo` matches `photo`).
pub fn is_representative_image(&self, node: &Node) -> bool {
    const INDICATORS: [&str; 6] = ["avatar", "profile", "logo", "photo", "picture", "portrait"];

    let haystacks = [
        node.attr("class").unwrap_or_default(),
        node.attr("id").unwrap_or_default(),
    ];
    for keyword in INDICATORS.iter() {
        if haystacks.iter().any(|text| text.contains(keyword)) {
            return true;
        }
    }
    false
}
/// Placeholder for validating an image URL.
///
/// No validation is performed yet: the method logs a debug message naming
/// `url` and reports every URL as valid.
pub async fn validate_image_url(&self, url: &Url) -> Result<bool, Error> {
tracing::debug!("Image URL validation not yet implemented for: {}", url);
Ok(true)
}
/// Returns the `alt` attribute of `node`, treating an empty value the same
/// as a missing attribute.
pub fn extract_alt_text(&self, node: &Node) -> Option<String> {
    node.attr("alt").filter(|alt| !alt.is_empty())
}
pub fn config(&self) -> &PictureParserBuilder {
&self.config
}
/// Parses a `srcset` attribute value without a real document base URL.
///
/// Relative candidate URLs are resolved against the placeholder base
/// `http://dummy-base.com/`; use [`Self::parse_srcset_with_base`] when the
/// document URL is known.
///
/// # Errors
///
/// Returns an error when a candidate URL cannot be parsed.
pub fn parse_srcset(&self, srcset: &str) -> Result<Vec<SrcSetEntry>, Error> {
    // The literal is a well-formed absolute URL, so a failure here would be
    // a programming error rather than a reaction to caller input.
    let placeholder_base = Url::parse("http://dummy-base.com/")
        .expect("hard-coded placeholder base URL must be valid");
    self.parse_srcset_with_base(srcset, &placeholder_base)
}
/// Parses a `srcset` attribute value into its image candidates, resolving
/// relative URLs against `base_url`.
///
/// Candidates are separated by commas. A comma does NOT end a candidate when
/// it is
///
/// * inside a single- or double-quoted section, or
/// * embedded in the candidate's URL token, i.e. no whitespace has been seen
///   since the URL started and the comma is directly followed by a character
///   that is neither whitespace nor another comma.
///
/// The second rule follows the HTML srcset parsing algorithm, where the URL
/// is a run of non-whitespace characters and only its trailing commas are
/// separators. Without it, `data:image/jpeg;base64,AAAA 1x` was split into
/// the two bogus candidates `data:image/jpeg;base64` and `AAAA 1x`.
///
/// Blank candidates (for example between two consecutive commas) are
/// skipped, and a blank `srcset` yields an empty list.
///
/// # Errors
///
/// Returns the first error produced while parsing an individual candidate.
pub fn parse_srcset_with_base(
    &self,
    srcset: &str,
    base_url: &Url,
) -> Result<Vec<SrcSetEntry>, Error> {
    if srcset.trim().is_empty() {
        return Ok(Vec::new());
    }

    let mut entries = Vec::new();
    let mut current_entry = String::new();
    let mut in_quotes = false;
    let mut quote_char = '\0';
    // Progress through the current candidate: `url_started` once its first
    // non-whitespace character was seen, `past_url` once whitespace followed.
    let mut url_started = false;
    let mut past_url = false;

    let mut chars = srcset.chars().peekable();
    while let Some(c) = chars.next() {
        match c {
            '"' | '\'' => {
                if in_quotes && c == quote_char {
                    in_quotes = false;
                    quote_char = '\0';
                } else if !in_quotes {
                    in_quotes = true;
                    quote_char = c;
                }
                url_started = true;
                current_entry.push(c);
            }
            ',' if !in_quotes => {
                let embedded_in_url = url_started
                    && !past_url
                    && matches!(
                        chars.peek(),
                        Some(next) if *next != ',' && !next.is_whitespace()
                    );
                if embedded_in_url {
                    current_entry.push(c);
                } else {
                    if !current_entry.trim().is_empty() {
                        entries.push(self.parse_srcset_entry(current_entry.trim(), base_url)?);
                    }
                    current_entry.clear();
                    url_started = false;
                    past_url = false;
                }
            }
            _ => {
                if !c.is_whitespace() {
                    url_started = true;
                } else if url_started {
                    past_url = true;
                }
                current_entry.push(c);
            }
        }
    }

    // The last candidate has no trailing comma to flush it.
    if !current_entry.trim().is_empty() {
        entries.push(self.parse_srcset_entry(current_entry.trim(), base_url)?);
    }
    Ok(entries)
}
/// Parses one `srcset` candidate (`<url> [descriptor...]`).
///
/// The candidate is split on whitespace. The last token recognised by
/// `is_valid_descriptor` and the unbroken run of valid descriptors directly
/// before it mark where the descriptors begin; everything in front of them
/// (joined with single spaces) is the URL, which lets URLs containing spaces
/// survive. The first token is always part of the URL. Without any valid
/// descriptor the whole trimmed candidate is the URL.
///
/// Descriptors are then read in order: `<n>w` sets the width, `<n>h` the
/// height, and a density descriptor is stored as text (a bare `x` as `1x`).
/// The first token that is not a valid descriptor is logged and ends
/// descriptor processing. Previously any token merely ending in `x` (such as
/// `foox`) was accepted as a density and could overwrite a valid one; tokens
/// are now checked with `is_valid_descriptor` first.
///
/// # Errors
///
/// Returns `Error::HtmlCodegen` when the candidate is blank or its URL
/// cannot be parsed or resolved against `base_url`.
fn parse_srcset_entry(&self, entry: &str, base_url: &Url) -> Result<SrcSetEntry, Error> {
    let trimmed_entry = entry.trim();
    if trimmed_entry.is_empty() {
        return Err(Error::HtmlCodegen(format!(
            "Empty srcset entry: '{}'",
            entry
        )));
    }
    // Non-blank input always yields at least one token.
    let parts: Vec<&str> = trimmed_entry.split_whitespace().collect();

    let last_valid = (1..parts.len())
        .rev()
        .find(|&i| self.is_valid_descriptor(parts[i]));
    let descriptor_start = match last_valid {
        Some(valid_idx) => {
            // Extend backwards over adjacent valid descriptors, never
            // swallowing the first token.
            let mut start = valid_idx;
            while start > 1 && self.is_valid_descriptor(parts[start - 1]) {
                start -= 1;
            }
            start
        }
        None => parts.len(),
    };
    let descriptor_parts = &parts[descriptor_start..];
    let url_part = if descriptor_parts.is_empty() {
        trimmed_entry.to_string()
    } else {
        parts[..descriptor_start].join(" ")
    };
    if url_part.is_empty() {
        return Err(Error::HtmlCodegen(format!(
            "No URL found in srcset entry: '{}'",
            entry
        )));
    }

    let is_absolute = url_part.starts_with("http://")
        || url_part.starts_with("https://")
        || url_part.starts_with("data:");
    let resolved_url = if is_absolute {
        Url::parse(&url_part).map_err(|e| {
            Error::HtmlCodegen(format!(
                "Invalid absolute URL in srcset '{}': {}",
                url_part, e
            ))
        })?
    } else {
        base_url.join(&url_part).map_err(|e| {
            Error::HtmlCodegen(format!(
                "Invalid relative URL in srcset '{}': {}",
                url_part, e
            ))
        })?
    };

    let mut width = None;
    let mut height = None;
    let mut density = None;
    for &part in descriptor_parts {
        if !self.is_valid_descriptor(part) {
            tracing::warn!(
                "Unknown descriptor in srcset entry: '{}' in '{}'",
                part,
                entry
            );
            break;
        }
        if let Some(w) = part.strip_suffix('w').and_then(|w| w.parse().ok()) {
            width = Some(w);
        } else if let Some(h) = part.strip_suffix('h').and_then(|h| h.parse().ok()) {
            height = Some(h);
        } else {
            // A valid descriptor that is neither width nor height is a density.
            density = Some(if part == "x" {
                "1x".to_string()
            } else {
                part.to_string()
            });
        }
    }

    Ok(SrcSetEntry {
        url: resolved_url,
        width,
        height,
        density,
    })
}
/// Reports whether `s` has the shape of a `srcset` descriptor.
///
/// Accepted forms are `<u32>w`, `<u32>h`, and `x` optionally preceded by a
/// number that parses as `f64`. The suffixes are checked in that order, so
/// a token ending in `w` is only ever judged as a width descriptor.
fn is_valid_descriptor(&self, s: &str) -> bool {
    let is_pixel_count = |digits: &str| digits.parse::<u32>().is_ok();

    if let Some(width) = s.strip_suffix('w') {
        return is_pixel_count(width);
    }
    if let Some(height) = s.strip_suffix('h') {
        return is_pixel_count(height);
    }
    match s.strip_suffix('x') {
        Some(ratio) => ratio.is_empty() || ratio.parse::<f64>().is_ok(),
        None => false,
    }
}
}
/// Default configuration: alt-text extraction enabled, redirect following
/// disabled, no file size limit.
impl Default for PictureParserBuilder {
fn default() -> Self {
Self {
// Not derivable: this flag defaults to `true`, whereas a derived
// `Default` would produce `false`.
extract_alt_text: true,
follow_redirects: false,
max_file_size: None,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::parse::element::test::{from_html_str, grab_element_from_document};
use microformats_types::{Class, PropertyValue};
use std::str::FromStr;
use url::Url;
/// A hand-built `PictureSource` keeps the values it was given.
#[test]
fn test_picture_source_creation() {
    let candidate = SrcSetEntry {
        url: "https://example.com/image.jpg".parse().unwrap(),
        width: Some(800),
        height: None,
        density: Some("1x".to_string()),
    };
    let source = PictureSource {
        srcset: vec![candidate],
        media: "screen and (min-width: 800px)".to_string(),
        r#type: "image/jpeg".to_string(),
        sizes: "100vw".to_string(),
        attributes: BTreeMap::new(),
    };

    assert_eq!(source.srcset.len(), 1);
    assert_eq!(source.media, "screen and (min-width: 800px)");
    assert_eq!(source.r#type, "image/jpeg");
}
/// A hand-built `Picture` exposes its single source and its fallback image.
#[test]
fn test_picture_creation() {
    let responsive = PictureSource {
        srcset: vec![SrcSetEntry {
            url: "https://example.com/responsive.jpg".parse().unwrap(),
            width: Some(1200),
            height: None,
            density: None,
        }],
        media: "screen and (min-width: 1200px)".to_string(),
        r#type: "image/webp".to_string(),
        sizes: "50vw".to_string(),
        attributes: BTreeMap::new(),
    };
    let picture = Picture {
        sources: vec![responsive],
        fallback: Some(Image {
            value: "https://example.com/fallback.jpg".parse().unwrap(),
            alt: Some("Fallback image".to_string()),
        }),
        attributes: BTreeMap::new(),
    };

    assert_eq!(picture.sources.len(), 1);
    assert!(picture.fallback.is_some());
    let fallback = picture.fallback.as_ref().unwrap();
    assert_eq!(fallback.alt, Some("Fallback image".to_string()));
}
/// The JSON form of a `SrcSetEntry` mentions both its URL and its density.
#[test]
fn test_srcset_entry_serialization() {
    let serialized = serde_json::to_string(&SrcSetEntry {
        url: "https://example.com/image-2x.jpg".parse().unwrap(),
        width: None,
        height: None,
        density: Some("2x".to_string()),
    })
    .unwrap();

    for needle in ["image-2x.jpg", "2x"].iter() {
        assert!(serialized.contains(needle));
    }
}
/// A picture with nothing in it serializes to an empty JSON object.
#[test]
fn test_picture_serialization() {
    let empty_picture = Picture {
        sources: Vec::new(),
        fallback: None,
        attributes: BTreeMap::new(),
    };
    let json = serde_json::to_string(&empty_picture).unwrap();
    println!("JSON output: {}", json);
    assert_eq!(json, "{}");
}
/// A freshly created parser has alt-text extraction switched on.
#[test]
fn test_picture_parser_creation() {
    let default_parser = PictureParser::new();
    let config = default_parser.config();
    assert!(config.extract_alt_text);
}
/// The builder methods end up in the configuration exposed by the parser.
///
/// The configuration is assembled through the public `with_*` methods (the
/// test previously filled the private fields directly, so the builder API
/// itself was never exercised).
#[test]
fn test_picture_parser_builder() {
    let builder = PictureParserBuilder::default()
        .with_alt_text_extraction(false)
        .with_redirect_following(true)
        .with_max_file_size(1024 * 1024);
    let parser = PictureParser::with_config(builder);
    assert!(!parser.config().extract_alt_text);
    assert!(parser.config().follow_redirects);
    assert_eq!(parser.config().max_file_size, Some(1024 * 1024));
}
/// Numeric `width` and `height` attributes are returned as a pair.
#[test]
fn test_extract_dimensions() {
    let doc = from_html_str(r#"<img src="test.jpg" width="100" height="200" alt="Test">"#);
    let img = grab_element_from_document(&doc, "img").unwrap();
    let node = Node { elem: img };

    let parser = PictureParser::new();
    assert_eq!(parser.extract_dimensions(&node), Some((100, 200)));
}
/// A `u-photo` class marks the image as representative.
#[test]
fn test_is_representative_image() {
    let doc = from_html_str(r#"<img src="avatar.jpg" class="u-photo" alt="Profile">"#);
    let img = grab_element_from_document(&doc, "img").unwrap();
    let node = Node { elem: img };

    let parser = PictureParser::new();
    assert!(parser.is_representative_image(&node));
}
/// A non-empty `alt` attribute is returned verbatim.
#[test]
fn test_extract_alt_text() {
    let doc = from_html_str(r#"<img src="test.jpg" alt="Alternative text">"#);
    let img = grab_element_from_document(&doc, "img").unwrap();
    let node = Node { elem: img };

    let parser = PictureParser::new();
    assert_eq!(
        parser.extract_alt_text(&node),
        Some("Alternative text".to_string())
    );
}
/// Two width-described candidates are parsed in order, with no other
/// descriptor set.
#[test]
fn test_parse_simple_srcset() {
    let entries = PictureParser::new()
        .parse_srcset(
            "https://example.com/image-400.jpg 400w, https://example.com/image-800.jpg 800w",
        )
        .unwrap();
    assert_eq!(entries.len(), 2);

    let expected = [
        ("https://example.com/image-400.jpg", 400),
        ("https://example.com/image-800.jpg", 800),
    ];
    for (entry, &(url, width)) in entries.iter().zip(expected.iter()) {
        assert_eq!(entry.url.as_str(), url);
        assert_eq!(entry.width, Some(width));
        assert_eq!(entry.height, None);
        assert_eq!(entry.density, None);
    }
}
/// Density descriptors are kept as text on each candidate.
#[test]
fn test_parse_srcset_with_density_descriptors() {
    let entries = PictureParser::new()
        .parse_srcset("https://example.com/image-1x.jpg 1x, https://example.com/image-2x.jpg 2x")
        .unwrap();
    assert_eq!(entries.len(), 2);

    let expected = [
        ("https://example.com/image-1x.jpg", "1x"),
        ("https://example.com/image-2x.jpg", "2x"),
    ];
    for (entry, &(url, density)) in entries.iter().zip(expected.iter()) {
        assert_eq!(entry.url.as_str(), url);
        assert_eq!(entry.density, Some(density.to_string()));
    }
}
/// Height descriptors populate the `height` field.
#[test]
fn test_parse_srcset_with_height_descriptors() {
    let entries = PictureParser::new()
        .parse_srcset(
            "https://example.com/image-600h.jpg 600h, https://example.com/image-900h.jpg 900h",
        )
        .unwrap();
    assert_eq!(entries.len(), 2);

    let expected = [
        ("https://example.com/image-600h.jpg", 600),
        ("https://example.com/image-900h.jpg", 900),
    ];
    for (entry, &(url, height)) in entries.iter().zip(expected.iter()) {
        assert_eq!(entry.url.as_str(), url);
        assert_eq!(entry.height, Some(height));
    }
}
/// Width, height and density given together are all captured.
#[test]
fn test_parse_srcset_with_multiple_descriptors() {
    let srcset = "https://example.com/image-400x600-2x.jpg 400w 600h 2x, https://example.com/image-800x1200-1x.jpg 800w 1200h 1x";
    let entries = PictureParser::new().parse_srcset(srcset).unwrap();
    assert_eq!(entries.len(), 2);

    let expected = [
        ("https://example.com/image-400x600-2x.jpg", 400, 600, "2x"),
        ("https://example.com/image-800x1200-1x.jpg", 800, 1200, "1x"),
    ];
    for (entry, &(url, width, height, density)) in entries.iter().zip(expected.iter()) {
        assert_eq!(entry.url.as_str(), url);
        assert_eq!(entry.width, Some(width));
        assert_eq!(entry.height, Some(height));
        assert_eq!(entry.density, Some(density.to_string()));
    }
}
/// Spaces inside a candidate URL are kept and come out percent-encoded.
#[test]
fn test_parse_srcset_with_urls_containing_spaces() {
    let srcset = "https://example.com/path%20to%20image.jpg 400w, https://example.com/path with spaces.jpg 800w";
    let entries = PictureParser::new().parse_srcset(srcset).unwrap();
    assert_eq!(entries.len(), 2);

    let already_encoded = &entries[0];
    assert_eq!(
        already_encoded.url.as_str(),
        "https://example.com/path%20to%20image.jpg"
    );
    assert_eq!(already_encoded.width, Some(400));

    let with_raw_spaces = &entries[1];
    assert_eq!(
        with_raw_spaces.url.as_str(),
        "https://example.com/path%20with%20spaces.jpg"
    );
    assert_eq!(with_raw_spaces.width, Some(800));
}
/// Commas separate candidates whether or not a descriptor precedes them.
#[test]
fn test_parse_srcset_respects_commas() {
    let srcset = "https://example.com/image1.jpg, https://example.com/image2.jpg 400w, https://example.com/image3.jpg, https://example.com/image4.jpg 2x";
    let entries = PictureParser::new().parse_srcset(srcset).unwrap();
    assert_eq!(entries.len(), 4);

    let expected_urls = [
        "https://example.com/image1.jpg",
        "https://example.com/image2.jpg",
        "https://example.com/image3.jpg",
        "https://example.com/image4.jpg",
    ];
    for (entry, expected) in entries.iter().zip(expected_urls.iter()) {
        assert_eq!(entry.url.as_str(), *expected);
    }
}
/// An unknown trailing descriptor does not prevent the candidate, or the
/// ones after it, from being parsed.
#[test]
fn test_parse_srcset_handles_malformed_input() {
    let entries = PictureParser::new()
        .parse_srcset("https://example.com/image.jpg 400w invalid, https://example.com/another.jpg")
        .unwrap();
    assert_eq!(entries.len(), 2);

    let (first, second) = (&entries[0], &entries[1]);
    assert_eq!(first.url.as_str(), "https://example.com/image.jpg");
    assert_eq!(first.width, Some(400));
    assert_eq!(second.url.as_str(), "https://example.com/another.jpg");
}
/// An empty `srcset` produces no candidates.
#[test]
fn test_parse_srcset_empty_input() {
    let entries = PictureParser::new().parse_srcset("").unwrap();
    assert_eq!(entries.len(), 0);
}
/// Root-relative candidate URLs are resolved against the supplied base.
#[test]
fn test_parse_srcset_url_parsing() {
    let base_url = Url::parse("https://example.com/").unwrap();
    let entries = PictureParser::new()
        .parse_srcset_with_base(
            "/images/hero-800w.jpg 800w, /images/hero-400w.jpg 400w",
            &base_url,
        )
        .unwrap();
    assert_eq!(entries.len(), 2);

    let expected_urls = [
        "https://example.com/images/hero-800w.jpg",
        "https://example.com/images/hero-400w.jpg",
    ];
    for (entry, expected) in entries.iter().zip(expected_urls.iter()) {
        assert_eq!(entry.url.as_str(), *expected);
    }
}
/// A picture with two sources and an `<img>` yields both sources with their
/// candidates plus the fallback image.
#[test]
fn test_parse_picture_element_basic() {
    let html = r#"
<picture>
<source srcset="/images/hero-800w.jpg 800w, /images/hero-400w.jpg 400w" media="(min-width: 800px)" type="image/jpeg">
<source srcset="/images/hero-600h.jpg 600h" media="(min-width: 600px)" type="image/webp">
<img src="/images/fallback.jpg" alt="Hero image">
</picture>
"#;
    let doc = from_html_str(html);
    let node = Node {
        elem: grab_element_from_document(&doc, "picture").unwrap(),
    };
    let base_url = Url::parse("https://example.com/").unwrap();

    let parsed = PictureParser::new()
        .parse_picture_element(&node, &base_url)
        .unwrap();
    assert!(parsed.is_some());
    let picture = parsed.unwrap();
    assert_eq!(picture.sources.len(), 2);

    let jpeg = &picture.sources[0];
    assert_eq!(jpeg.media, "(min-width: 800px)");
    assert_eq!(jpeg.r#type, "image/jpeg");
    assert_eq!(jpeg.srcset.len(), 2);
    assert_eq!(
        jpeg.srcset[0].url.as_str(),
        "https://example.com/images/hero-800w.jpg"
    );
    assert_eq!(jpeg.srcset[0].width, Some(800));
    assert_eq!(
        jpeg.srcset[1].url.as_str(),
        "https://example.com/images/hero-400w.jpg"
    );
    assert_eq!(jpeg.srcset[1].width, Some(400));

    let webp = &picture.sources[1];
    assert_eq!(webp.media, "(min-width: 600px)");
    assert_eq!(webp.r#type, "image/webp");
    assert_eq!(webp.srcset.len(), 1);
    assert_eq!(
        webp.srcset[0].url.as_str(),
        "https://example.com/images/hero-600h.jpg"
    );
    assert_eq!(webp.srcset[0].height, Some(600));

    assert!(picture.fallback.is_some());
    let fallback = picture.fallback.unwrap();
    assert_eq!(
        fallback.value.as_str(),
        "https://example.com/images/fallback.jpg"
    );
    assert_eq!(fallback.alt, Some("Hero image".to_string()));
}
/// The `sizes` attribute of a source is preserved verbatim.
#[test]
fn test_parse_picture_element_with_sizes() {
    let html = r#"
<picture>
<source srcset="/images/responsive-800w.jpg 800w, /images/responsive-400w.jpg 400w"
media="(min-width: 800px)"
type="image/jpeg"
sizes="(min-width: 800px) 800px, 400px">
<img src="/images/fallback-400.jpg" alt="Responsive image" width="400" height="300">
</picture>
"#;
    let doc = from_html_str(html);
    let node = Node {
        elem: grab_element_from_document(&doc, "picture").unwrap(),
    };
    let base_url = Url::parse("https://example.com/").unwrap();

    let parsed = PictureParser::new()
        .parse_picture_element(&node, &base_url)
        .unwrap();
    assert!(parsed.is_some());
    let picture = parsed.unwrap();

    assert_eq!(picture.sources.len(), 1);
    let only_source = &picture.sources[0];
    assert_eq!(only_source.sizes, "(min-width: 800px) 800px, 400px");
    assert_eq!(only_source.srcset.len(), 2);

    assert!(picture.fallback.is_some());
    let fallback = picture.fallback.unwrap();
    assert_eq!(
        fallback.value.as_str(),
        "https://example.com/images/fallback-400.jpg"
    );
}
/// An empty `<picture>` still parses, with no sources and no fallback.
#[test]
fn test_parse_picture_element_empty_picture() {
    let doc = from_html_str(r#"<picture></picture>"#);
    let node = Node {
        elem: grab_element_from_document(&doc, "picture").unwrap(),
    };
    let base_url = Url::parse("https://example.com/").unwrap();

    let parsed = PictureParser::new()
        .parse_picture_element(&node, &base_url)
        .unwrap();
    assert!(parsed.is_some());
    let picture = parsed.unwrap();
    assert_eq!(picture.sources.len(), 0);
    assert!(picture.fallback.is_none());
}
/// Without an `<img>` child the sources are parsed and the fallback stays
/// empty.
#[test]
fn test_parse_picture_element_only_sources() {
    let html = r#"
<picture>
<source srcset="/images/hero-800w.jpg 800w" media="(min-width: 800px)" type="image/webp">
<source srcset="/images/hero-400w.jpg 400w" media="(min-width: 400px)" type="image/png">
</picture>
"#;
    let doc = from_html_str(html);
    let node = Node {
        elem: grab_element_from_document(&doc, "picture").unwrap(),
    };
    let base_url = Url::parse("https://example.com/").unwrap();

    let parsed = PictureParser::new()
        .parse_picture_element(&node, &base_url)
        .unwrap();
    assert!(parsed.is_some());
    let picture = parsed.unwrap();
    assert_eq!(picture.sources.len(), 2);
    assert!(picture.fallback.is_none());

    let first_urls = [
        "https://example.com/images/hero-800w.jpg",
        "https://example.com/images/hero-400w.jpg",
    ];
    for (source, expected) in picture.sources.iter().zip(first_urls.iter()) {
        assert_eq!(source.srcset[0].url.as_str(), *expected);
    }
}
/// A picture holding only an `<img>` yields just the fallback image.
#[test]
fn test_parse_picture_element_only_fallback() {
    let html = r#"
<picture>
<img src="/images/single.jpg" alt="Single image">
</picture>
"#;
    let doc = from_html_str(html);
    let node = Node {
        elem: grab_element_from_document(&doc, "picture").unwrap(),
    };
    let base_url = Url::parse("https://example.com/").unwrap();

    let parsed = PictureParser::new()
        .parse_picture_element(&node, &base_url)
        .unwrap();
    assert!(parsed.is_some());
    let picture = parsed.unwrap();
    assert_eq!(picture.sources.len(), 0);

    assert!(picture.fallback.is_some());
    let fallback = picture.fallback.unwrap();
    assert_eq!(
        fallback.value.as_str(),
        "https://example.com/images/single.jpg"
    );
    assert_eq!(fallback.alt, Some("Single image".to_string()));
}
fn parse_html_to_item(html: &str) -> Result<microformats_types::Item, crate::Error> {
let base_url = Url::parse("https://example.com/").unwrap();
let mut parser = crate::parse::Parser::from_html(html.to_string())?;
let document = parser.into_document(Some(base_url))?;
document
.items
.into_iter()
.next()
.ok_or_else(|| crate::Error::HtmlCodegen("No items found".to_string()))
}
/// `p-name` on a `<picture>` contributes the fallback image's alt text as a
/// plain value alongside the heading text.
#[test]
fn test_pname_from_picture() -> Result<(), crate::Error> {
    let html = r#"
<article class="h-entry">
<h1 class="p-name">Blog Post Title</h1>
<picture class="p-name">
<source srcset="/images/hero-800w.jpg 800w, /images/hero-400w.jpg 400w"
media="(min-width: 800px)" type="image/jpeg">
<img src="/images/fallback.jpg" alt="Hero image">
</picture>
</article>
"#;
    let item = parse_html_to_item(html)?;
    let names = item
        .properties
        .get("name")
        .ok_or_else(|| crate::Error::HtmlCodegen("No name property found".to_string()))?;
    assert_eq!(names.len(), 2);

    // Only plain values count; order within the property is not asserted.
    let has_plain = |expected: &str| {
        names.iter().any(|value| match value {
            PropertyValue::Plain(plain) => plain.to_string() == expected,
            _ => false,
        })
    };
    assert!(has_plain("Blog Post Title"), "Blog Post Title not found");
    assert!(has_plain("Hero image"), "Hero image not found");
    Ok(())
}
/// Two `u-photo` pictures produce two image values, in document order, each
/// taken from the picture's fallback `<img>`.
#[test]
fn test_multiple_pictures() -> Result<(), crate::Error> {
    let html = r#"
<article class="h-entry">
<h1 class="p-name">Blog Post with Images</h1>
<picture class="u-photo">
<source srcset="/images/hero-800w.jpg 800w, /images/hero-400w.jpg 400w"
media="(min-width: 800px)" type="image/jpeg">
<img src="/images/hero-fallback.jpg" alt="Hero image">
</picture>
<picture class="u-photo">
<source srcset="/images/thumb-200w.jpg 200w" media="(min-width: 200px)" type="image/png">
<img src="/images/thumb-fallback.png" alt="Thumbnail">
</picture>
</article>
"#;
    let item = parse_html_to_item(html)?;
    let photos = item
        .properties
        .get("photo")
        .ok_or_else(|| crate::Error::HtmlCodegen("No photo property found".to_string()))?;
    assert_eq!(photos.len(), 2);

    if let PropertyValue::Image(hero) = &photos[0] {
        assert_eq!(
            hero.value.as_str(),
            "https://example.com/images/hero-fallback.jpg"
        );
        assert_eq!(hero.alt, Some("Hero image".to_string()));
    } else {
        panic!(
            "Expected Image property value for first photo, got: {:?}",
            photos[0]
        );
    }

    if let PropertyValue::Image(thumbnail) = &photos[1] {
        assert_eq!(
            thumbnail.value.as_str(),
            "https://example.com/images/thumb-fallback.png"
        );
        assert_eq!(thumbnail.alt, Some("Thumbnail".to_string()));
    } else {
        panic!(
            "Expected Image property value for second photo, got: {:?}",
            photos[1]
        );
    }
    Ok(())
}
/// A `u-photo` picture with several sources does not add anything to the
/// item's `name` property.
#[test]
fn test_picture_multiple_sources() -> Result<(), crate::Error> {
    let html = r#"
<article class="h-entry">
<h1 class="p-name">Responsive Image Post</h1>
<picture class="u-photo">
<source srcset="/images/desktop-1200w.jpg 1200w, /images/desktop-800w.jpg 800w"
media="(min-width: 1200px)" type="image/jpeg">
<source srcset="/images/tablet-800w.jpg 800w, /images/tablet-400w.jpg 400w"
media="(min-width: 768px)" type="image/webp">
<source srcset="/images/mobile-400w.jpg 400w"
media="(min-width: 320px)" type="image/png">
<img src="/images/fallback.jpg" alt="Responsive hero image">
</picture>
</article>
"#;
    let item = parse_html_to_item(html)?;
    let names = item
        .properties
        .get("name")
        .ok_or_else(|| crate::Error::HtmlCodegen("No name property found".to_string()))?;
    assert_eq!(names.len(), 1);

    if let PropertyValue::Plain(plain) = &names[0] {
        assert_eq!(plain.to_string(), "Responsive Image Post");
    } else {
        panic!(
            "Expected Plain property value for p-name, got: {:?}",
            names[0]
        );
    }
    Ok(())
}
/// `data:` URLs in both `srcset` and `src` are accepted; the photo value is
/// the fallback image with its data URL and alt text.
#[test]
fn test_picture_data_urls() -> Result<(), crate::Error> {
    let html = r#"
<article class="h-entry">
<h1 class="p-name">Post with Data URL</h1>
<picture class="u-photo">
<source srcset="data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAYEBQYFBAYGBQYHBwYIChAKCgkJChQODwwQFxQYGBcUFhYaHSUfGhsjHBYWICwgIyYnKSopGR8tMC0oMCUoKSj/2wBDAQcHBwoIChMKChMoGhYaKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCj/wAARCAABAAEDASIAAhEBAxEB/8QAFQABAQAAAAAAAAAAAAAAAAAAAAv/xAAUEAEAAAAAAAAAAAAAAAAAAAAA/8QAFQEBAQAAAAAAAAAAAAAAAAAAAAX/xAAUEQEAAAAAAAAAAAAAAAAAAAAA/9oADAMBAAIRAxEAPwCdABmX/9k= 1x"
type="image/jpeg">
<img src="data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAYEBQYFBAYGBQYHBwYIChAKCgkJChQODwwQFxQYGBcUFhYaHSUfGhsjHBYWICwgIyYnKSopGR8tMC0oMCUoKSj/2wBDAQcHBwoIChMKChMoGhYaKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCj/wAARCAABAAEDASIAAhEBAxEB/8QAFQABAQAAAAAAAAAAAAAAAAAAAAv/xAAUEAEAAAAAAAAAAAAAAAAAAAAA/8QAFQEBAQAAAAAAAAAAAAAAAAAAAAX/xAAUEQEAAAAAAAAAAAAAAAAAAAAA/9oADAMBAAIRAxEAPwCdABmX/9k= 2x"
alt="Data URL image">
</picture>
</article>
"#;
    let item = parse_html_to_item(html)?;
    let photos = item
        .properties
        .get("photo")
        .ok_or_else(|| crate::Error::HtmlCodegen("No photo property found".to_string()))?;
    assert_eq!(photos.len(), 1);

    if let PropertyValue::Image(image) = &photos[0] {
        assert!(image.value.as_str().starts_with("data:image/jpeg;base64,"));
        assert_eq!(image.alt, Some("Data URL image".to_string()));
    } else {
        panic!("Expected Image property value, got: {:?}", photos[0]);
    }
    Ok(())
}
/// End-to-end check on a realistic blog post: the `h-entry` exposes `name`,
/// `photo` and `content`, and its two `u-photo` pictures resolve to their
/// fallback images in document order. When a second top-level item exists it
/// must be the author `h-card` with its avatar photo.
///
/// Compared with the earlier version, the emptiness check uses `is_empty()`
/// and the property lookup no longer allocates a `String` per key.
#[test]
fn test_realistic_blog_post() -> Result<(), crate::Error> {
    let html = r#"
<!DOCTYPE html>
<html>
<head>
<title>My Travel Blog</title>
</head>
<body>
<article class="h-entry" id="travel-post-1">
<h1 class="p-name">Amazing Trip to Japan</h1>
<div class="p-author h-card">
<img class="u-photo" src="/images/author-avatar.jpg" alt="John Traveler">
<span class="p-name">John Traveler</span>
</div>
<picture class="u-photo">
<source srcset="/images/japan-hero-1200w.jpg 1200w, /images/japan-hero-800w.jpg 800w, /images/japan-hero-400w.jpg 400w"
media="(min-width: 1200px)" type="image/jpeg">
<source srcset="/images/japan-hero-tablet-800w.jpg 800w, /images/japan-hero-tablet-400w.jpg 400w"
media="(min-width: 768px)" type="image/webp">
<source srcset="/images/japan-hero-mobile-400w.jpg 400w, /images/japan-hero-mobile-200w.jpg 200w"
media="(min-width: 320px)" type="image/png">
<img src="/images/japan-hero-fallback.jpg" alt="Beautiful cherry blossoms in Tokyo">
</picture>
<div class="e-content">
<p>I recently visited Japan and it was absolutely amazing! The cherry blossoms were in full bloom.</p>
<picture class="u-photo">
<source srcset="/images/cherry-blossom-600w.jpg 600w"
media="(min-width: 600px)" type="image/jpeg">
<img src="/images/cherry-blossom-300w.jpg" alt="Close-up of cherry blossoms">
</picture>
<p>The temples were breathtaking and the food was incredible.</p>
</div>
</article>
</body>
</html>
"#;
    let base_url = Url::parse("https://mytravelblog.example.com/").unwrap();
    let mut parser = crate::parse::Parser::from_html(html.to_string())?;
    let document = parser.into_document(Some(base_url))?;

    // Diagnostic output; the test harness only shows it when the test fails.
    println!("Number of items found: {}", document.items.len());
    for (i, item) in document.items.iter().enumerate() {
        println!("Item {}: types={:?}", i, item.r#type);
        println!(
            "  Properties: {:?}",
            item.properties.keys().collect::<Vec<_>>()
        );
    }

    assert!(
        !document.items.is_empty(),
        "Should have at least the h-entry item"
    );
    let h_entry = &document.items[0];
    assert_eq!(h_entry.r#type, vec![Class::from_str("h-entry").unwrap()]);
    assert_eq!(h_entry.id, Some("travel-post-1".to_string()));

    let expected_properties = ["name", "photo", "content"];
    for prop in &expected_properties {
        assert!(
            h_entry.properties.contains_key(*prop),
            "Missing property: {}",
            prop
        );
    }

    let photo_property = h_entry
        .properties
        .get("photo")
        .ok_or_else(|| crate::Error::HtmlCodegen("No photo property found".to_string()))?;
    assert_eq!(photo_property.len(), 2);
    match &photo_property[0] {
        PropertyValue::Image(image) => {
            assert_eq!(
                image.value.as_str(),
                "https://mytravelblog.example.com/images/japan-hero-fallback.jpg"
            );
            assert_eq!(
                image.alt,
                Some("Beautiful cherry blossoms in Tokyo".to_string())
            );
        }
        _ => panic!("Expected Image property value for hero image"),
    }
    match &photo_property[1] {
        PropertyValue::Image(image) => {
            assert_eq!(
                image.value.as_str(),
                "https://mytravelblog.example.com/images/cherry-blossom-300w.jpg"
            );
            assert_eq!(image.alt, Some("Close-up of cherry blossoms".to_string()));
        }
        _ => panic!("Expected Image property value for cherry blossom image"),
    }

    // Only checked when the h-card shows up as a second top-level item.
    if document.items.len() > 1 {
        let h_card = &document.items[1];
        assert_eq!(h_card.r#type, vec![Class::from_str("h-card").unwrap()]);
        let h_card_photo = h_card.properties.get("photo").ok_or_else(|| {
            crate::Error::HtmlCodegen("No photo property found in h-card".to_string())
        })?;
        assert_eq!(h_card_photo.len(), 1);
        match &h_card_photo[0] {
            PropertyValue::Image(image) => {
                assert_eq!(
                    image.value.as_str(),
                    "https://mytravelblog.example.com/images/author-avatar.jpg"
                );
                assert_eq!(image.alt, Some("John Traveler".to_string()));
            }
            _ => panic!("Expected Image property value for author avatar"),
        }
    }
    Ok(())
}
/// Degenerate `<picture>` markup must not prevent the enclosing h-entry from
/// being parsed.
///
/// The fixture holds three `u-photo` pictures: one whose `srcset` contains a
/// malformed candidate, one that is completely empty, and one that has only
/// `<source>` children and no `<img>`. The test only requires that exactly one
/// item comes back and that it carries a `photo` property.
#[test]
fn test_picture_edge_cases() -> Result<(), crate::Error> {
    let markup = r#"
<article class="h-entry">
<h1 class="p-name">Post with Edge Cases</h1>
<!-- Picture with malformed srcset -->
<picture class="u-photo">
<source srcset="valid.jpg 400w, broken, valid2.jpg 800w"
media="(min-width: 800px)" type="image/jpeg">
<img src="/images/fallback.jpg" alt="Edge case image">
</picture>
<!-- Picture with empty picture element -->
<picture class="u-photo">
</picture>
<!-- Picture with only sources, no img -->
<picture class="u-photo">
<source srcset="/images/only-sources.jpg 400w" type="image/jpeg">
</picture>
</article>
"#;

    let origin = Url::parse("https://example.com/").unwrap();
    let mut html_parser = crate::parse::Parser::from_html(String::from(markup))?;
    let parsed = html_parser.into_document(Some(origin))?;

    // Only the h-entry itself should be reported as an item.
    assert_eq!(parsed.items.len(), 1);

    // At least one of the pictures must have produced a photo value.
    let entry = &parsed.items[0];
    assert!(entry.properties.contains_key("photo"));

    Ok(())
}
/// Coarse performance guard: an h-entry with many `u-photo` pictures must
/// parse within a generous wall-clock budget and yield one photo per picture.
///
/// The timed region covers base-URL construction, `Parser::from_html` and
/// `into_document`; building the HTML fixture is deliberately excluded.
#[test]
fn test_picture_performance() -> Result<(), crate::Error> {
    use std::fmt::Write as _;
    use std::time::Instant;

    // Single source of truth for the fixture size so the generator loop, the
    // count assertion and the log line cannot drift apart.
    const PICTURE_COUNT: usize = 50;
    // Upper bound (milliseconds) for the timed region.
    const MAX_PARSE_MILLIS: u128 = 5000;

    let mut html =
        String::from(r#"<article class="h-entry"><h1 class="p-name">Performance Test</h1>"#);
    for i in 0..PICTURE_COUNT {
        // `write!` appends directly into `html`, avoiding the temporary
        // `String` that `push_str(&format!(..))` would allocate per iteration.
        write!(
            html,
            r#"
<picture class="u-photo">
<source srcset="/images/{}-800w.jpg 800w, /images/{}-400w.jpg 400w"
media="(min-width: 800px)" type="image/jpeg">
<img src="/images/{}-fallback.jpg" alt="Image {}">
</picture>
"#,
            i, i, i, i
        )
        .expect("formatting into a String cannot fail");
    }
    html.push_str("</article>");

    let start = Instant::now();
    let base_url = Url::parse("https://example.com/").unwrap();
    let mut parser = crate::parse::Parser::from_html(html)?;
    let document = parser.into_document(Some(base_url))?;
    let duration = start.elapsed();

    assert!(
        duration.as_millis() < MAX_PARSE_MILLIS,
        "Picture parsing took too long: {:?}",
        duration
    );

    assert_eq!(document.items.len(), 1);
    let item = &document.items[0];
    let photo_property = item
        .properties
        .get("photo")
        .expect("h-entry should expose a photo property");
    assert_eq!(photo_property.len(), PICTURE_COUNT);

    println!(
        "Parsed {} picture elements in {:?}",
        PICTURE_COUNT, duration
    );
    Ok(())
}
/// A `<source>` with an empty `srcset` must not stop the `<img>` fallback
/// from being reported: the h-entry is expected to end up with exactly one
/// `photo` value.
///
/// # Errors
///
/// Returns the parser's error if the fixture cannot be parsed, or
/// `Error::HtmlCodegen` if the item has no `photo` property at all.
#[test]
fn test_picture_empty_srcset() -> Result<(), crate::Error> {
    let html = r#"
<article class="h-entry">
<h1 class="p-name">Post with Empty Source</h1>
<picture class="u-photo">
<source srcset="" media="(min-width: 800px)" type="image/jpeg">
<img src="/images/fallback.jpg" alt="Fallback image">
</picture>
</article>
"#;
    let item = parse_html_to_item(html)?;

    // Surface a missing property as an error, the same way the sibling tests
    // do, instead of checking `is_some()` and then unwrapping.
    let photo_property = item
        .properties
        .get("photo")
        .ok_or_else(|| crate::Error::HtmlCodegen("No photo property found".to_string()))?;
    assert_eq!(photo_property.len(), 1);
    Ok(())
}
/// A `u-photo` picture whose `<img>` has no `alt` attribute still yields a
/// single photo value pointing at the fallback image.
///
/// Two representations are accepted: an `Image` value with `alt == None`, or a
/// plain `Url` value. Anything else fails the test.
#[test]
fn test_picture_no_alt_text() -> Result<(), crate::Error> {
    const FALLBACK_URL: &str = "https://example.com/images/fallback.jpg";

    let markup = r#"
<article class="h-entry">
<h1 class="p-name">Post with Image No Alt</h1>
<picture class="u-photo">
<source srcset="/images/hero-800w.jpg 800w, /images/hero-400w.jpg 400w"
media="(min-width: 800px)" type="image/jpeg">
<img src="/images/fallback.jpg">
</picture>
</article>
"#;

    let entry = parse_html_to_item(markup)?;
    let photos = match entry.properties.get("photo") {
        Some(values) => values,
        None => {
            return Err(crate::Error::HtmlCodegen(
                "No photo property found".to_string(),
            ))
        }
    };
    assert_eq!(photos.len(), 1);

    match &photos[0] {
        // Without alt text the parser may collapse the value to a bare URL.
        PropertyValue::Url(plain) => {
            assert_eq!(plain.as_str(), FALLBACK_URL);
        }
        PropertyValue::Image(img) => {
            assert_eq!(img.value.as_str(), FALLBACK_URL);
            assert_eq!(img.alt, None);
        }
        unexpected => panic!(
            "Expected Image or Url property value, got: {:?}",
            unexpected
        ),
    }

    Ok(())
}
/// A top-level h-card with a plain `<img class="u-photo">` followed by an
/// h-entry with a `<picture class="u-photo">` must produce two items, in
/// document order, each carrying exactly one `photo` value.
#[test]
fn test_picture_in_hcard() -> Result<(), crate::Error> {
    let markup = r#"
<div class="h-card">
<img class="u-photo" src="/avatar.jpg" alt="John Doe">
</div>
<article class="h-entry">
<h1 class="p-name">Post by John</h1>
<picture class="u-photo">
<source srcset="/images/author-400w.jpg 400w, /images/author-200w.jpg 200w"
media="(min-width: 400px)" type="image/jpeg">
<img src="/images/author-fallback.jpg" alt="John's photo">
</picture>
</article>
"#;

    let origin = Url::parse("https://example.com/").unwrap();
    let mut html_parser = crate::parse::Parser::from_html(markup.to_string())?;
    let parsed = html_parser.into_document(Some(origin))?;
    assert_eq!(parsed.items.len(), 2);

    // (expected root class, error text when the photo property is absent),
    // listed in the order the items appear in the document.
    let expectations = [
        ("h-card", "No photo property found in h-card"),
        ("h-entry", "No photo property found in h-entry"),
    ];

    for (item, &(class_name, missing_msg)) in parsed.items.iter().zip(expectations.iter()) {
        assert_eq!(item.r#type, vec![Class::from_str(class_name).unwrap()]);
        let photos = item
            .properties
            .get("photo")
            .ok_or_else(|| crate::Error::HtmlCodegen(missing_msg.to_string()))?;
        assert_eq!(photos.len(), 1);
    }

    Ok(())
}
/// Path-relative URLs (no leading slash) inside a `u-photo` picture are
/// expected to come back as absolute URLs, with the `<img>` alt text kept.
///
/// The single photo value must be an `Image` whose URL is
/// `https://example.com/images/fallback.jpg` and whose alt text is
/// `"Relative path image"`.
#[test]
fn test_picture_relative_urls() -> Result<(), crate::Error> {
    let markup = r#"
<article class="h-entry">
<h1 class="p-name">Post with Relative URLs</h1>
<picture class="u-photo">
<source srcset="images/hero-800w.jpg 800w, images/hero-400w.jpg 400w"
media="(min-width: 800px)" type="image/jpeg">
<img src="images/fallback.jpg" alt="Relative path image">
</picture>
</article>
"#;

    let entry = parse_html_to_item(markup)?;
    let photos = entry
        .properties
        .get("photo")
        .ok_or_else(|| crate::Error::HtmlCodegen("No photo property found".to_string()))?;
    assert_eq!(photos.len(), 1);

    let first = &photos[0];
    if let microformats_types::PropertyValue::Image(img) = first {
        assert_eq!(
            img.value.as_str(),
            "https://example.com/images/fallback.jpg"
        );
        assert_eq!(img.alt, Some(String::from("Relative path image")));
    } else {
        panic!("Expected Image property value, got: {:?}", first);
    }

    Ok(())
}
}