use scraper::node::Element;
/// How an HTML element should be treated, as decided by [`classify`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ElementAction {
    /// Returned for non-content tags: `script`, `style`, `head`, `meta`, `link`,
    /// `title` and `noscript`.
    Skip,
    /// Returned for table-structure tags (`thead`, `tr`, `td`, ...) and for every
    /// tag `classify` does not recognise.
    // NOTE(review): presumably the consumer descends into the children without
    // emitting anything for the element itself; confirm against the caller.
    Transparent,
    /// The element is block-level content of the given kind.
    Block(BlockKind),
    /// The element is inline content of the given kind.
    Inline(InlineKind),
}

/// Block-level element categories produced by [`classify`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockKind {
    /// `<p>`
    Paragraph,
    /// `<h1>`..`<h6>`; the payload is the heading level (1 to 6 as produced by `classify`).
    Heading(u8),
    /// `<blockquote>`
    Blockquote,
    /// `<ul>` or `<menu>`
    UnorderedList,
    /// `<ol>`
    OrderedList,
    /// `<li>`
    ListItem,
    /// `<pre>`
    PreFormatted,
    /// `<hr>`
    HorizontalRule,
    /// `<table>`
    Table,
    /// Generic containers: `div`, `section`, `article`, `main`, `header`, `footer`,
    /// `nav`, `aside`, `figure`, `figcaption`, `details` and `summary`.
    Div,
}

/// Inline element categories produced by [`classify`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InlineKind {
    /// `<strong>` or `<b>`
    Bold,
    /// `<em>` or `<i>`
    Italic,
    /// `<del>`, `<s>` or `<strike>`
    Strikethrough,
    /// `<code>` or `<tt>`
    Code,
    /// `<a>`
    Link,
    /// `<img>`
    Image,
    /// `<br>`
    LineBreak,
    /// `<sup>`
    Superscript,
    /// `<sub>`
    Subscript,
}
pub fn classify(el: &Element) -> ElementAction {
match el.name() {
"script" | "style" | "head" | "meta" | "link" | "title" | "noscript" => {
ElementAction::Skip
}
"p" => ElementAction::Block(BlockKind::Paragraph),
"h1" => ElementAction::Block(BlockKind::Heading(1)),
"h2" => ElementAction::Block(BlockKind::Heading(2)),
"h3" => ElementAction::Block(BlockKind::Heading(3)),
"h4" => ElementAction::Block(BlockKind::Heading(4)),
"h5" => ElementAction::Block(BlockKind::Heading(5)),
"h6" => ElementAction::Block(BlockKind::Heading(6)),
"blockquote" => ElementAction::Block(BlockKind::Blockquote),
"ul" | "menu" => ElementAction::Block(BlockKind::UnorderedList),
"ol" => ElementAction::Block(BlockKind::OrderedList),
"li" => ElementAction::Block(BlockKind::ListItem),
"pre" => ElementAction::Block(BlockKind::PreFormatted),
"hr" => ElementAction::Block(BlockKind::HorizontalRule),
"table" => ElementAction::Block(BlockKind::Table),
"thead" | "tbody" | "tfoot" | "tr" | "td" | "th" | "caption" | "colgroup" | "col" => {
ElementAction::Transparent
}
"div" | "section" | "article" | "main" | "header" | "footer" | "nav" | "aside"
| "figure" | "figcaption" | "details" | "summary" => {
ElementAction::Block(BlockKind::Div)
}
"strong" | "b" => ElementAction::Inline(InlineKind::Bold),
"em" | "i" => ElementAction::Inline(InlineKind::Italic),
"del" | "s" | "strike" => ElementAction::Inline(InlineKind::Strikethrough),
"code" | "tt" => ElementAction::Inline(InlineKind::Code),
"a" => ElementAction::Inline(InlineKind::Link),
"img" => ElementAction::Inline(InlineKind::Image),
"br" => ElementAction::Inline(InlineKind::LineBreak),
"sup" => ElementAction::Inline(InlineKind::Superscript),
"sub" => ElementAction::Inline(InlineKind::Subscript),
_ => ElementAction::Transparent,
}
}
/// Heuristically reports whether an image element looks like a tracking pixel
/// or is otherwise not worth rendering.
///
/// Returns `true` when any of the following holds:
/// - the `width` or `height` attribute is exactly `"0"` or `"1"`;
/// - the `src` attribute is missing or empty;
/// - `src` is a base64 GIF data URI (prefix `data:image/gif;base64,R0lGOD`);
/// - the inline `style` declares `display: none`, or sets `width`, `height`,
///   `max-width` or `max-height` to `1px` or to a zero length.
///
/// The inline style is examined one declaration at a time, so unrelated
/// properties such as `border-width: 0`, `min-width: 0` or `line-height: 1px`
/// and non-zero values such as `width: 0.5em` do not trigger a match, while
/// arbitrary whitespace around `:` and a trailing `!important` are tolerated.
pub fn is_tracking_pixel(el: &Element) -> bool {
    // True for CSS values that are numerically zero, with or without a unit
    // ("0", "0px", "0.0em", "0%").
    fn is_zero_length(value: &str) -> bool {
        let number = value.trim_end_matches(|c: char| c.is_ascii_alphabetic() || c == '%');
        matches!(number.parse::<f64>(), Ok(n) if n == 0.0)
    }

    let tiny_attr = |name: &str| matches!(el.attr(name), Some("0" | "1"));
    if tiny_attr("width") || tiny_attr("height") {
        return true;
    }

    // An image without a usable source has nothing to show.
    let Some(src) = el.attr("src") else {
        return true;
    };
    if src.is_empty() || src.starts_with("data:image/gif;base64,R0lGOD") {
        return true;
    }

    let Some(style) = el.attr("style") else {
        return false;
    };
    let style = style.to_lowercase();
    style
        .split(';')
        .filter_map(|declaration| declaration.split_once(':'))
        .any(|(property, value)| {
            // Drop a trailing "!important" so it does not defeat the comparison.
            let value = value.split('!').next().unwrap_or(value).trim();
            match property.trim() {
                "display" => value == "none",
                "width" | "height" | "max-width" | "max-height" => {
                    value == "1px" || is_zero_length(value)
                }
                _ => false,
            }
        })
}
/// Unit tests for `classify`, `is_tracking_pixel` and `is_hidden`.
///
/// Each test builds a tiny HTML fragment, selects the first matching element
/// and runs one of the functions under test on it.
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::{Html, Selector};

    /// Parses `html` as a fragment, selects the first element matching
    /// `selector` and returns `check` applied to it.
    ///
    /// Panics when the selector is invalid or nothing matches.
    fn with_first_match<T>(
        html: &str,
        selector: &str,
        check: impl FnOnce(&Element) -> T,
    ) -> T {
        let doc = Html::parse_fragment(html);
        let sel = Selector::parse(selector).expect("test selector must be valid");
        let el = doc
            .select(&sel)
            .next()
            .expect("test fragment must contain a matching element");
        check(el.value())
    }

    /// Classifies an empty `<tag></tag>` element.
    fn classify_tag(tag: &str) -> ElementAction {
        with_first_match(&format!("<{tag}></{tag}>"), tag, classify)
    }

    /// Runs `is_tracking_pixel` on an `<img>` carrying the given raw attributes.
    fn img_is_pixel(attrs: &str) -> bool {
        with_first_match(
            &format!("<div><img {attrs} ></div>"),
            "img",
            is_tracking_pixel,
        )
    }

    /// Runs `is_hidden` on a `<div>` carrying the given raw attributes.
    fn div_is_hidden(attrs: &str) -> bool {
        with_first_match(&format!("<div {attrs}></div>"), "div", is_hidden)
    }

    // ---- classify ----

    #[test]
    fn classify_h1_is_heading_1() {
        assert!(matches!(classify_tag("h1"), ElementAction::Block(BlockKind::Heading(1))));
    }

    #[test]
    fn classify_h2_is_heading_2() {
        assert!(matches!(classify_tag("h2"), ElementAction::Block(BlockKind::Heading(2))));
    }

    #[test]
    fn classify_h3_is_heading_3() {
        assert!(matches!(classify_tag("h3"), ElementAction::Block(BlockKind::Heading(3))));
    }

    #[test]
    fn classify_h4_is_heading_4() {
        assert!(matches!(classify_tag("h4"), ElementAction::Block(BlockKind::Heading(4))));
    }

    #[test]
    fn classify_h5_is_heading_5() {
        assert!(matches!(classify_tag("h5"), ElementAction::Block(BlockKind::Heading(5))));
    }

    #[test]
    fn classify_h6_is_heading_6() {
        assert!(matches!(classify_tag("h6"), ElementAction::Block(BlockKind::Heading(6))));
    }

    #[test]
    fn classify_script_is_skip() {
        assert!(matches!(classify_tag("script"), ElementAction::Skip));
    }

    #[test]
    fn classify_table_is_block_table() {
        assert!(matches!(classify_tag("table"), ElementAction::Block(BlockKind::Table)));
    }

    #[test]
    fn classify_strong_is_inline_bold() {
        assert!(matches!(classify_tag("strong"), ElementAction::Inline(InlineKind::Bold)));
    }

    #[test]
    fn classify_unlisted_tag_is_transparent() {
        assert!(matches!(classify_tag("span"), ElementAction::Transparent));
    }

    // ---- is_tracking_pixel: width/height attributes ----

    #[test]
    fn pixel_width_1_only() {
        assert!(img_is_pixel(r#"src="x" width="1" height="100""#));
    }

    #[test]
    fn pixel_height_1_only() {
        assert!(img_is_pixel(r#"src="x" width="100" height="1""#));
    }

    #[test]
    fn pixel_width_0_only() {
        assert!(img_is_pixel(r#"src="x" width="0" height="100""#));
    }

    // ---- is_tracking_pixel: src attribute ----

    #[test]
    fn pixel_no_src_is_pixel() {
        assert!(img_is_pixel(r#"width="100" height="100""#));
    }

    #[test]
    fn pixel_empty_src_is_pixel() {
        assert!(img_is_pixel(r#"src="" width="100" height="100""#));
    }

    #[test]
    fn pixel_transparent_gif_data_uri_is_pixel() {
        assert!(img_is_pixel(
            r#"src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" width="100" height="100""#
        ));
    }

    // ---- is_tracking_pixel: inline style ----

    #[test]
    fn pixel_style_width_1px() {
        assert!(img_is_pixel(r#"src="x" style="width:1px""#));
    }

    #[test]
    fn pixel_style_width_space_1px() {
        assert!(img_is_pixel(r#"src="x" style="width: 1px""#));
    }

    #[test]
    fn pixel_style_width_0() {
        assert!(img_is_pixel(r#"src="x" style="width:0""#));
    }

    #[test]
    fn pixel_style_height_1px() {
        assert!(img_is_pixel(r#"src="x" style="height:1px""#));
    }

    #[test]
    fn pixel_style_height_space_1px() {
        assert!(img_is_pixel(r#"src="x" style="height: 1px""#));
    }

    #[test]
    fn pixel_style_height_0() {
        assert!(img_is_pixel(r#"src="x" style="height:0""#));
    }

    #[test]
    fn pixel_style_display_none() {
        assert!(img_is_pixel(r#"src="x" style="display:none""#));
    }

    #[test]
    fn pixel_style_display_space_none() {
        assert!(img_is_pixel(r#"src="x" style="display: none""#));
    }

    #[test]
    fn pixel_normal_image_is_not_pixel() {
        assert!(!img_is_pixel(
            r#"src="https://example.com/cat.jpg" width="500" height="300""#
        ));
    }

    // ---- is_hidden ----

    #[test]
    fn hidden_display_none() {
        assert!(div_is_hidden(r#"style="display:none""#));
    }

    #[test]
    fn hidden_display_space_none() {
        assert!(div_is_hidden(r#"style="display: none""#));
    }

    #[test]
    fn hidden_visibility_hidden() {
        assert!(div_is_hidden(r#"style="visibility:hidden""#));
    }

    #[test]
    fn hidden_visibility_space_hidden() {
        assert!(div_is_hidden(r#"style="visibility: hidden""#));
    }

    #[test]
    fn hidden_font_size_0() {
        assert!(div_is_hidden(r#"style="font-size:0""#));
    }

    #[test]
    fn hidden_font_size_space_0() {
        assert!(div_is_hidden(r#"style="font-size: 0""#));
    }

    #[test]
    fn hidden_line_height_0() {
        assert!(div_is_hidden(r#"style="line-height:0""#));
    }

    #[test]
    fn hidden_line_height_space_0() {
        assert!(div_is_hidden(r#"style="line-height: 0""#));
    }

    #[test]
    fn hidden_height_0_with_overflow_no_spaces() {
        assert!(div_is_hidden(r#"style="height:0;overflow:hidden""#));
    }

    #[test]
    fn hidden_height_0_with_overflow_with_spaces() {
        assert!(div_is_hidden(r#"style="height: 0;overflow: hidden""#));
    }

    #[test]
    fn hidden_height_0_alone_is_not_hidden() {
        assert!(!div_is_hidden(r#"style="height:0""#));
    }

    #[test]
    fn hidden_height_space_0_alone_is_not_hidden() {
        assert!(!div_is_hidden(r#"style="height: 0""#));
    }

    #[test]
    fn hidden_max_height_0() {
        assert!(div_is_hidden(r#"style="max-height:0""#));
    }

    #[test]
    fn hidden_max_height_space_0() {
        assert!(div_is_hidden(r#"style="max-height: 0""#));
    }

    #[test]
    fn hidden_no_signal_in_style() {
        assert!(!div_is_hidden(r#"style="color:red;font-weight:bold""#));
    }

    #[test]
    fn hidden_no_style_attr_is_not_hidden() {
        assert!(!div_is_hidden(""));
    }
}
/// Reports whether an element's inline `style` attribute hides its content.
///
/// Returns `true` when the style declares any of:
/// - `display: none`
/// - `visibility: hidden`
/// - a zero `font-size`, `line-height` or `max-height`
/// - a zero `height` together with `overflow: hidden`
///
/// Returns `false` when there is no `style` attribute.
///
/// The style is examined one declaration at a time, so look-alike properties
/// and values (`backface-visibility: hidden`, `min-height: 0`,
/// `font-size: 0.9em`) do not match, while arbitrary whitespace around `:`
/// and a trailing `!important` are tolerated. For `height` and `overflow`
/// the last declaration in the attribute wins.
pub fn is_hidden(el: &Element) -> bool {
    // True for CSS values that are numerically zero, with or without a unit
    // ("0", "0px", "0.0em", "0%").
    fn is_zero_length(value: &str) -> bool {
        let number = value.trim_end_matches(|c: char| c.is_ascii_alphabetic() || c == '%');
        matches!(number.parse::<f64>(), Ok(n) if n == 0.0)
    }

    let Some(style) = el.attr("style") else {
        return false;
    };
    let style = style.to_lowercase();

    // `height: 0` only hides content when overflow is clipped as well, so the
    // two signals are collected separately and combined at the end.
    let mut zero_height = false;
    let mut overflow_hidden = false;

    for (property, value) in style.split(';').filter_map(|decl| decl.split_once(':')) {
        // Drop a trailing "!important" so it does not defeat the comparison.
        let value = value.split('!').next().unwrap_or(value).trim();
        match property.trim() {
            "display" if value == "none" => return true,
            "visibility" if value == "hidden" => return true,
            "font-size" | "line-height" | "max-height" if is_zero_length(value) => return true,
            "height" => zero_height = is_zero_length(value),
            "overflow" => overflow_hidden = value == "hidden",
            _ => {}
        }
    }

    zero_height && overflow_hidden
}