use crate::page::NodeHandle;
/// Structural summary of an element: its tag, class names, attribute names and depth.
///
/// Derives serde `Serialize`/`Deserialize`; the `attr_names` field is exchanged
/// under the key `attrNames`, every other field under its Rust name.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct ElementFingerprint {
/// Tag name of the element (for example `"div"`).
pub tag: String,
/// Class names carried by the element.
pub classes: Vec<String>,
/// Names (not values) of the element's attributes; serialized as `attrNames`.
#[serde(rename = "attrNames")]
pub attr_names: Vec<String>,
/// Depth of the element. Presumably its nesting level in the document tree —
/// TODO confirm against the code that builds fingerprints.
pub depth: u32,
}
/// Tunable parameters for a similarity search.
///
/// `PartialEq` is derived so configurations can be compared directly (for
/// example with `assert_eq!`). `Eq` is not available because `threshold` is a
/// float.
#[derive(Debug, Clone, PartialEq)]
pub struct SimilarityConfig {
    /// Score threshold. Presumably the minimum similarity score a candidate
    /// needs to count as a match — confirm at the call site.
    pub threshold: f32,
    /// Result-count limit. Presumably the maximum number of matches returned —
    /// confirm at the call site.
    pub max_results: usize,
}
impl SimilarityConfig {
    /// Threshold value that a default-constructed configuration starts with.
    pub const DEFAULT_THRESHOLD: f32 = 0.7_f32;
}
impl Default for SimilarityConfig {
    /// Builds a configuration with `threshold` set to
    /// [`SimilarityConfig::DEFAULT_THRESHOLD`] and `max_results` set to 10.
    fn default() -> Self {
        let threshold = Self::DEFAULT_THRESHOLD;
        let max_results = 10_usize;
        Self {
            threshold,
            max_results,
        }
    }
}
/// A node paired with a similarity score.
// NOTE(review): no traits are derived here, not even `Debug`. Whether any can be
// added depends on what `NodeHandle` implements, which is not visible in this file.
pub struct SimilarMatch {
/// Handle of the matched node.
pub node: NodeHandle,
/// Score attached to `node`. Presumably the value produced by
/// `jaccard_weighted` — confirm at the call site.
pub score: f32,
}
/// Scores how closely `candidate` resembles `reference`.
///
/// The result is a weighted sum of four components:
///
/// | weight | component                                                   |
/// |--------|-------------------------------------------------------------|
/// | 0.40   | `1.0` when the tags are equal (exact string match), else `0.0` |
/// | 0.35   | overlap of `classes`, as computed by `jaccard_sets`         |
/// | 0.15   | overlap of `attr_names`, as computed by `jaccard_sets`      |
/// | 0.10   | depth closeness, `1 / (1 + |depth difference|)`             |
///
/// Depths above `u16::MAX` are clamped to `u16::MAX` before the difference is taken.
pub fn jaccard_weighted(reference: &ElementFingerprint, candidate: &ElementFingerprint) -> f32 {
    // Narrow to `u16` (saturating) so the conversion into `f32` is lossless.
    let depth_as_f32 = |depth: u32| f32::from(u16::try_from(depth).unwrap_or(u16::MAX));

    let same_tag = f32::from(u8::from(reference.tag == candidate.tag));
    let class_overlap = jaccard_sets(&reference.classes, &candidate.classes);
    let attr_overlap = jaccard_sets(&reference.attr_names, &candidate.attr_names);

    let depth_gap = (depth_as_f32(reference.depth) - depth_as_f32(candidate.depth)).abs();
    let depth_closeness = (1.0_f32 + depth_gap).recip();

    // Accumulate from the innermost term outwards with fused multiply-adds.
    let tag_and_classes = same_tag.mul_add(0.4_f32, class_overlap * 0.35_f32);
    let with_attrs = attr_overlap.mul_add(0.15_f32, tag_and_classes);
    depth_closeness.mul_add(0.1_f32, with_attrs)
}
/// Computes the Jaccard index `|A ∩ B| / |A ∪ B|` of two collections of strings.
///
/// The inputs are treated as sets: element order is irrelevant and repeated
/// entries count once. Two empty inputs are defined as identical and score `1.0`.
///
/// Returns a value in `0.0..=1.0`. Set sizes above `u16::MAX` saturate before the
/// conversion to `f32`, so the ratio is only approximate for inputs that large.
fn jaccard_sets(a: &[String], b: &[String]) -> f32 {
    // Normalise into real sets first. A merge walk over the raw slices is only
    // correct when both are sorted and duplicate-free, which callers (and
    // deserialized data) do not guarantee.
    let set_a: std::collections::BTreeSet<&str> = a.iter().map(String::as_str).collect();
    let set_b: std::collections::BTreeSet<&str> = b.iter().map(String::as_str).collect();

    if set_a.is_empty() && set_b.is_empty() {
        return 1.0_f32;
    }

    let intersection = set_a.intersection(&set_b).count();
    // Cannot underflow: the intersection is never larger than either set.
    let union = set_a.len() + set_b.len() - intersection;

    // Go through `u16` so the conversion into `f32` is lossless (no `as` cast).
    let i_f = f32::from(u16::try_from(intersection).unwrap_or(u16::MAX));
    let u_f = f32::from(u16::try_from(union).unwrap_or(u16::MAX));
    i_f / u_f
}
#[cfg(test)]
// The serde tests call `expect`; a panic there is the intended way for them to fail.
#[allow(clippy::expect_used)]
mod tests {
    use super::*;

    /// Copies borrowed names into an owned vector, preserving order.
    fn owned(names: &[&str]) -> Vec<String> {
        names.iter().map(|name| String::from(*name)).collect()
    }

    /// Builds an [`ElementFingerprint`] from borrowed parts.
    fn fp(tag: &str, classes: &[&str], attrs: &[&str], depth: u32) -> ElementFingerprint {
        ElementFingerprint {
            tag: String::from(tag),
            classes: owned(classes),
            attr_names: owned(attrs),
            depth,
        }
    }

    /// A fingerprint compared with its own clone scores 1.0.
    #[test]
    fn jaccard_identical() {
        let reference = fp("div", &["card", "highlighted"], &["data-id"], 3);
        let twin = reference.clone();
        let score = jaccard_weighted(&reference, &twin);
        let distance_from_one = (score - 1.0_f32).abs();
        assert!(
            distance_from_one < 1e-5_f32,
            "identical fingerprints should score 1.0, got {score}"
        );
    }

    /// Nothing in common and far apart in depth: the score stays close to zero.
    #[test]
    fn jaccard_disjoint() {
        let reference = fp("div", &["foo", "bar"], &["data-x"], 0);
        let other = fp("span", &["baz", "qux"], &["data-y"], 20);
        let score = jaccard_weighted(&reference, &other);
        assert!(
            score < 0.05_f32,
            "disjoint fingerprints should score near 0, got {score}"
        );
        assert!(score >= 0.0_f32, "score must be non-negative, got {score}");
    }

    /// Same tag and depth with half-overlapping classes lands strictly between the extremes.
    #[test]
    fn jaccard_partial() {
        let reference = fp("div", &["a", "b"], &[], 2);
        let other = fp("div", &["a", "c"], &[], 2);
        let score = jaccard_weighted(&reference, &other);
        assert!(
            score > 0.5_f32,
            "partial-match fingerprint should score > 0.5, got {score}"
        );
        assert!(
            score < 0.9_f32,
            "partial-match fingerprint should score < 0.9, got {score}"
        );
    }

    /// The constant and the `Default` impl agree on 0.7 and 10 results.
    #[test]
    fn similarity_config_default_threshold() {
        let declared = SimilarityConfig::DEFAULT_THRESHOLD;
        assert!(
            (declared - 0.7_f32).abs() < f32::EPSILON,
            "DEFAULT_THRESHOLD should be 0.7"
        );
        let SimilarityConfig {
            threshold,
            max_results,
        } = SimilarityConfig::default();
        assert!(
            (threshold - SimilarityConfig::DEFAULT_THRESHOLD).abs() < f32::EPSILON,
            "default threshold should equal DEFAULT_THRESHOLD"
        );
        assert_eq!(max_results, 10);
    }

    /// Serializing to JSON and back yields an equal fingerprint.
    #[test]
    fn fingerprint_serde_roundtrip() {
        let original = fp(
            "section",
            &["main", "wrapper"],
            &["aria-label", "data-section"],
            5,
        );
        let json = serde_json::to_string(&original).expect("serialize");
        let decoded: ElementFingerprint = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(original, decoded);
    }

    /// The `attr_names` field is written under the camelCase key `attrNames`.
    #[test]
    fn fingerprint_serde_attr_names_key() {
        let fp_val = fp("a", &[], &["href"], 1);
        let json = serde_json::to_string(&fp_val).expect("serialize");
        let has_renamed_key = json.contains("\"attrNames\"");
        assert!(
            has_renamed_key,
            "JSON key must be 'attrNames', got: {json}"
        );
    }
}