use std::iter;
use super::parsed_fragment::TextDirective;
use crate::types::FileType;
use html5gum::{Token, Tokenizer};
use log::warn;
use regex::Regex;
/// Checks whether every text directive can be found in the visible text of `content`.
///
/// Only HTML content is inspected: for any other `file_type`, or when `directives` is
/// empty, the check passes without looking at `content`.
///
/// Returns `true` if all directives match (or nothing had to be checked) and `false` as
/// soon as one directive does not match.
pub(super) fn check_text_fragments(
    directives: &[TextDirective],
    content: &str,
    file_type: FileType,
) -> bool {
    // Nothing to verify for non-HTML input or when no directive was given.
    if file_type != FileType::Html || directives.is_empty() {
        return true;
    }

    // Extract the visible text once and reuse it for every directive.
    let visible_text = extract_visible_text(content);
    directives
        .iter()
        .all(|candidate| candidate.matches(&visible_text))
}
impl TextDirective {
    /// Builds a regular expression that finds this directive in whitespace-normalized text.
    ///
    /// Each component is trimmed, whitespace-normalized and regex-escaped so that it is
    /// matched literally. The pattern is assembled in this order, with the optional parts
    /// only present when the corresponding field is set:
    ///
    /// ```text
    /// [prefix \s*] start [.+? end] [\s* suffix]
    /// ```
    ///
    /// # Errors
    ///
    /// Returns the [`regex::Error`] produced by [`Regex::new`] if the assembled pattern
    /// cannot be compiled.
    fn to_regex(&self) -> Result<Regex, regex::Error> {
        /// Turns one directive component into a literal, whitespace-normalized pattern.
        fn literal(part: &str) -> String {
            regex::escape(&normalize_whitespace(part.trim()))
        }

        let mut pattern = String::new();

        if let Some(prefix) = &self.prefix {
            pattern.push_str(&literal(prefix));
            // Whitespace between the prefix and the match is optional.
            pattern.push_str(r"\s*");
        }

        pattern.push_str(&literal(&self.start));

        if let Some(end) = &self.end {
            // Lazily span the text between `start` and `end` so the nearest `end` that
            // still allows the rest of the pattern to match is chosen.
            pattern.push_str(".+?");
            pattern.push_str(&literal(end));
        }

        if let Some(suffix) = &self.suffix {
            // Whitespace between the match and the suffix is optional.
            pattern.push_str(r"\s*");
            pattern.push_str(&literal(suffix));
        }

        Regex::new(&pattern)
    }

    /// Returns `true` if this directive occurs in `document`.
    ///
    /// If the directive cannot be turned into a regular expression, a warning is logged
    /// and the directive is reported as not matching.
    fn matches(&self, document: &str) -> bool {
        match self.to_regex() {
            Ok(regex) => regex.is_match(document),
            Err(e) => {
                warn!("Failed to create regex for text fragment {self:?}. {e:?}");
                false
            }
        }
    }
}
/// Returns `true` if `name` is one of the element names treated as hidden.
///
/// The comparison is exact and case-sensitive.
fn is_hidden_tag(name: &str) -> bool {
    /// Element names that count as hidden.
    const HIDDEN_TAGS: &[&str] = &[
        "head", "script", "style", "template", "iframe", "img", "meter", "object", "progress",
        "video", "audio", "select",
    ];

    HIDDEN_TAGS.contains(&name)
}
/// Extracts the text of an HTML document that text directives are matched against.
///
/// Text inside elements for which [`is_hidden_tag`] returns `true` is skipped. Runs of
/// whitespace are collapsed into a single space. Tags themselves never add whitespace,
/// so `a<span>b</span>` yields `ab`.
fn extract_visible_text(input: &str) -> String {
    /// Appends a single space unless `text` already ends with whitespace.
    fn push_space(text: &mut String) {
        if !text.ends_with(char::is_whitespace) {
            text.push(' ');
        }
    }

    /// Returns `true` for HTML void elements, which never have an end tag.
    fn is_void_element(name: &str) -> bool {
        matches!(
            name,
            "area"
                | "base"
                | "br"
                | "col"
                | "embed"
                | "hr"
                | "img"
                | "input"
                | "link"
                | "meta"
                | "source"
                | "track"
                | "wbr"
        )
    }

    let mut text = String::new();
    // Names of the hidden elements that are currently open, innermost last.
    let mut hidden_stack: Vec<String> = Vec::new();

    for Ok(token) in Tokenizer::new(input) {
        match token {
            Token::StartTag(tag) => {
                let tag_name = String::from_utf8_lossy(&tag.name);
                // A void element such as `<img>` has no content and is never closed by an
                // end tag. Pushing it would leave it on the stack forever and hide all the
                // text that follows it.
                if is_hidden_tag(&tag_name) && !is_void_element(&tag_name) {
                    hidden_stack.push(tag_name.into_owned());
                }
            }
            Token::EndTag(tag) => {
                let tag_name = String::from_utf8_lossy(&tag.name);
                // Only the innermost open hidden element can be closed.
                if hidden_stack
                    .last()
                    .is_some_and(|open| open.as_str() == &*tag_name)
                {
                    hidden_stack.pop();
                }
            }
            Token::String(value) if hidden_stack.is_empty() => {
                let string = String::from_utf8_lossy(&value);
                // Keep a single separator for leading/trailing whitespace so that words
                // from neighbouring text tokens are not glued together.
                if string.starts_with(char::is_whitespace) {
                    push_space(&mut text);
                }
                text.extend(intersperse_whitespace(&string));
                if string.ends_with(char::is_whitespace) {
                    push_space(&mut text);
                }
            }
            // Text inside hidden elements and all other tokens do not contribute text.
            _ => {}
        }
    }

    text
}
/// Splits `text` on whitespace and yields its words separated by single `" "` items.
///
/// Leading and trailing whitespace produce no items, so whitespace-only or empty input
/// yields nothing.
fn intersperse_whitespace(text: &str) -> impl Iterator<Item = &str> {
    let mut words = text.split_whitespace();
    // The first word is emitted as is; every later word is preceded by one space.
    let head = words.next();
    let tail = iter::repeat(" ")
        .zip(words)
        .flat_map(|(space, word)| [space, word]);
    head.into_iter().chain(tail)
}
/// Returns `text` with every run of whitespace collapsed into a single space and without
/// leading or trailing whitespace.
fn normalize_whitespace(text: &str) -> String {
    let mut normalized = String::new();
    normalized.extend(intersperse_whitespace(text));
    normalized
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::utils::fragment_checker::parsed_fragment::ParsedFragment;
    use url::Url;

    /// HTML fixture shared by the tests below.
    const INDEX_HTML: &str = include_str!("../../../../fixtures/text_fragments/index.html");

    /// Parses `url` and checks its text directives against `html`, treated as HTML content.
    fn fragment_matches(url: &str, html: &str) -> bool {
        let url = Url::parse(url).expect("test URL should be valid");
        let parsed = ParsedFragment::parse(&url);
        check_text_fragments(&parsed.text_directives, html, FileType::Html)
    }

    #[test]
    fn extracts_visible_text_without_style_or_attributes() {
        let text = extract_visible_text(INDEX_HTML);
        assert!(text.contains("Sed porta nisl sit amet quam ornare rutrum."));
        assert!(
            text.contains("Proin vulputate mi id sem pulvinar euismod."),
            "{text}"
        );
        assert!(!text.contains("my-style-property"));
        assert!(!text.contains("my-element-attribute-value"));
    }

    #[test]
    fn extract_visible_text_capital_tags() {
        assert!(!extract_visible_text("<STYLE> inside </STYLE>").contains("inside"));
        assert!(!extract_visible_text("<STYLE> inside </style>").contains("inside"));
        assert!(extract_visible_text("<STYLE> inside </style> after").contains("after"));
        assert!(extract_visible_text("<style> inside </STYLE> after").contains("after"));
    }

    #[test]
    fn extract_visible_text_whitespace_implied_by_adjacent_tags() {
        assert!(
            extract_visible_text("a<span>b</span>").contains("ab"),
            "span is an inline tag, so should /not/ create whitespace"
        );
    }

    #[test]
    fn extract_visible_text_alternative_whitespaces() {
        assert!(
            extract_visible_text("a\n\t b").contains("a b"),
            "all spaces should be collapsed"
        );
        // The space is written as a character reference so that entity decoding is
        // actually exercised here.
        assert!(
            extract_visible_text("a&nbsp;b").contains("a b"),
            "encoded space should be interpreted as space"
        );
        assert!(
            extract_visible_text("a\u{00A0}b").contains("a b"),
            "inline nbsp should also be interpreted as space"
        );
    }

    #[test]
    fn check_text_fragments_from_chrome() {
        // Prefix, start and suffix all present in the fixture.
        assert!(fragment_matches(
            "http://localhost:8080/#:~:text=Proin-,vulputate,-mi%20id%20sem",
            INDEX_HTML
        ));
        // Same directive with a suffix that does not occur.
        assert!(!fragment_matches(
            "http://localhost:8080/#:~:text=Proin-,vulputate,-mi%20id%20sema",
            INDEX_HTML
        ));
        // Start and end without prefix or suffix.
        assert!(fragment_matches(
            "http://localhost:8080/#:~:text=massa.,Proin",
            INDEX_HTML
        ));
        // Suffix consisting of punctuation only.
        assert!(fragment_matches(
            "http://localhost:8080/#:~:text=sit%20amet%20dignissim-,massa,-.",
            INDEX_HTML
        ));
        // Same directive with a start that does not occur.
        assert!(!fragment_matches(
            "http://localhost:8080/#:~:text=sit%20amet%20dignissim-,massam,-.",
            INDEX_HTML
        ));
    }

    #[test]
    fn check_text_fragments_alternative_whitespaces() {
        assert!(
            fragment_matches(
                "http://127.0.0.1:8000/a.html#:~:text=b%C2%A0cd",
                "b\u{00a0}cd"
            ),
            "percent encoded nbsp in fragment should be decoded"
        );
        assert!(
            fragment_matches("http://127.0.0.1:8000/a.html#:~:text=b%20cd", "b\u{00a0}cd"),
            "%20 space in fragment should match any space in the text"
        );
    }

    #[test]
    fn check_text_fragments_prefix_and_suffix() {
        const URL: &str =
            "https://en.wikipedia.org/wiki/Most_common_words_in_English#:~:text=in-,the,the,-texts";

        let html = "<p>written in the English language.</p>\n\n<p>In total, the texts</p>";
        assert!(
            fragment_matches(URL, html),
            "whitespace should be skipped between the match and prefix/suffix"
        );

        let html = "<p>in</p><p>the English language.</p>\n<p>In total, the</p><p>texts</p>";
        assert!(
            fragment_matches(URL, html),
            "prefix/suffix should match across block tags"
        );
    }

    #[test]
    fn check_text_fragments_multiple_occurrences() {
        const URL: &str = "https://en.wikipedia.org/wiki/#:~:text=prefix-,start,end,-suffix";

        assert!(
            fragment_matches(URL, "start [ prefix start end suffix ]"),
            "should work with multiple occurrences of start, only one has prefix"
        );
        assert!(
            fragment_matches(URL, "[ prefix start end end suffix ]"),
            "should work with multiple occurrences of end, only one has suffix"
        );
        assert!(
            fragment_matches(URL, "start [ prefix start end end suffix ]"),
            "should work with multiple occurrences of both start and end"
        );
    }
}