use super::{FeedFilter, FeedFilterConfig};
use ego_tree::{NodeId, NodeMut};
use regex::{Regex, RegexBuilder, RegexSet, RegexSetBuilder};
use scraper::{Html, Node};
use serde::{Deserialize, Serialize};
use crate::{
html::fragment_root_node_id,
util::{ConfigError, Result},
};
/// User-facing configuration for the highlight filter.
///
/// The keyword/pattern selection is flattened into this struct, so the
/// `keywords` (or `patterns`) key sits at the same level as `bg_color`.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct HighlightConfig {
/// What to highlight: either literal keywords or regex patterns.
#[serde(flatten)]
keywords: KeywordsOrPatterns,
/// Value used for the `background-color` style of each highlighted span.
/// When absent, `build` falls back to `#ffff00`.
bg_color: Option<String>,
}
/// The two alternative ways of specifying what should be highlighted.
///
/// The enum is untagged: serde tries the variants in declaration order and
/// keeps the first one that deserializes, so the variant is effectively
/// selected by which field name (`keywords` or `patterns`) is present.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(untagged)]
enum KeywordsOrPatterns {
/// Literal strings. They are passed through `regex::escape` before being
/// compiled, so regex metacharacters in a keyword match themselves.
Keywords {
keywords: Vec<String>,
},
/// Regular expressions, (de)serialized through `serde_regex` as compiled
/// `Regex` values.
Patterns {
patterns: serde_regex::Serde<Vec<Regex>>,
},
}
impl KeywordsOrPatterns {
    /// Consumes the selection and returns it as a list of regex source
    /// strings.
    ///
    /// Keywords are escaped so they match literally; patterns are returned
    /// as the source text of the already-compiled regexes.
    fn into_patterns(self) -> Result<Vec<String>> {
        let sources: Vec<String> = match self {
            Self::Keywords { keywords } => keywords
                .into_iter()
                .map(|keyword| regex::escape(&keyword))
                .collect(),
            Self::Patterns { patterns } => patterns
                .into_inner()
                .into_iter()
                .map(|compiled| compiled.as_str().to_owned())
                .collect(),
        };
        Ok(sources)
    }
}
#[async_trait::async_trait]
impl FeedFilterConfig for HighlightConfig {
    type Filter = Highlight;

    /// Turns the configuration into a ready-to-run [`Highlight`] filter.
    ///
    /// The configured keywords/patterns are converted to regex sources, and
    /// the background color defaults to `#ffff00` when none was given.
    async fn build(self) -> Result<Self::Filter> {
        let Self { keywords, bg_color } = self;
        let sources = keywords.into_patterns()?;
        let color = bg_color.unwrap_or_else(|| "#ffff00".into());
        Highlight::new(&sources, color)
    }
}
/// Filter that wraps every keyword/pattern match found in a post's
/// description with a colored `<span>`.
pub struct Highlight {
/// Value written into the `background-color` style of each highlight span.
bg_color: String,
/// All patterns compiled together; used to test whether (and which)
/// patterns match a piece of text.
regexset: RegexSet,
/// The same patterns compiled individually, in the same order as in
/// `regexset`, so a set-match index can be used to look up the regex that
/// yields the actual match offsets.
patterns: Vec<Regex>,
}
/// A contiguous piece of a text node after it has been split around the
/// pattern matches.
enum TextSegment {
/// Text that did not match any pattern; re-inserted as a plain text node.
Text(String),
/// Text that matched a pattern; inserted wrapped in a highlight `<span>`.
Highlight(String),
}
impl TextSegment {
    /// Escapes the characters that are significant in HTML text content and
    /// in double-quoted attribute values.
    fn escape_html(raw: &str) -> String {
        let mut escaped = String::with_capacity(raw.len());
        for ch in raw.chars() {
            match ch {
                '&' => escaped.push_str("&amp;"),
                '<' => escaped.push_str("&lt;"),
                '>' => escaped.push_str("&gt;"),
                '"' => escaped.push_str("&quot;"),
                other => escaped.push(other),
            }
        }
        escaped
    }

    /// Inserts this segment into the tree as the next sibling of `node` and
    /// returns the id of the newly inserted node.
    ///
    /// * `color` - value for the `background-color` style of a highlight
    ///   span; ignored for plain text segments.
    /// * `node` - the node after which the segment is inserted.
    fn insert(
        self,
        color: &str,
        node: &mut NodeMut<'_, scraper::Node>,
    ) -> NodeId {
        use scraper::node::Text;
        match self {
            Self::Text(text) => {
                let new_node = Node::Text(Text { text: text.into() });
                node.insert_after(new_node).id()
            }
            Self::Highlight(text) => {
                // The span is built as markup and parsed again, so both the
                // matched text (taken from an already-decoded text node) and
                // the configured color must be escaped. Otherwise text such
                // as `<b>` or `&lt;` would be reinterpreted as HTML instead
                // of being reproduced verbatim.
                let fragment = format!(
                    "<span style=\"background-color: {}\" class=\"rss-funnel-hl\">{}</span>",
                    Self::escape_html(color),
                    Self::escape_html(&text)
                );
                insert_sibling_fragment(node, &fragment)
            }
        }
    }
}
impl Highlight {
    /// Builds a highlighter from regex pattern sources.
    ///
    /// Every pattern is compiled case-insensitively twice: once as part of a
    /// combined `RegexSet` (to find out which patterns match) and once as a
    /// standalone `Regex` (to obtain match offsets). Both collections keep
    /// the order of `patterns`.
    ///
    /// # Errors
    ///
    /// Returns an error when any pattern fails to compile.
    fn new<T: AsRef<str>>(patterns: &[T], bg_color: String) -> Result<Self> {
        let regexset = RegexSetBuilder::new(patterns)
            .case_insensitive(true)
            .build()
            .map_err(ConfigError::from)?;
        let patterns = patterns
            .iter()
            .map(|p| {
                RegexBuilder::new(p.as_ref())
                    .case_insensitive(true)
                    .build()
                    .map_err(|e| ConfigError::from(e).into())
            })
            .collect::<Result<Vec<Regex>>>()?;
        Ok(Self {
            patterns,
            regexset,
            bg_color,
        })
    }

    /// Parses `description` as an HTML fragment, highlights the matches in
    /// all of its text nodes and returns the re-serialized HTML.
    fn highlight_html(&self, description: &str) -> String {
        let mut html = Html::parse_fragment(description);
        // Collect the ids up front: the tree is mutated while highlighting,
        // and nodes inserted by the highlighter must not be visited again.
        let text_node_ids: Vec<NodeId> = html
            .tree
            .nodes()
            .filter(|node| matches!(node.value(), Node::Text(_)))
            .map(|node| node.id())
            .collect();
        for node_id in text_node_ids {
            let mut node = html.tree.get_mut(node_id).expect("unreachable");
            self.highlight_text_node(&mut node);
        }
        html.html()
    }

    /// Splits a single text node around the pattern matches.
    ///
    /// The original node is emptied and the resulting segments are inserted
    /// after it, in order, as siblings. Nodes that are not text, or that end
    /// up with nothing to highlight, are left untouched.
    fn highlight_text_node(&self, node: &mut NodeMut<'_, Node>) {
        let text = match node.value() {
            Node::Text(text) => text.to_string(),
            _ => return,
        };
        if !self.regexset.is_match(&text) {
            return;
        }
        let segments = self.segmentize_text(&text);
        // A pattern may match only the empty string; in that case there is
        // nothing to wrap and the node should not be rewritten at all.
        let has_highlight = segments
            .iter()
            .any(|segment| matches!(segment, TextSegment::Highlight(_)));
        if !has_highlight {
            return;
        }
        match node.value() {
            Node::Text(text) => text.text.clear(),
            _ => return,
        }
        // Each segment is inserted after the previously inserted one so the
        // original order of the text is preserved.
        let mut next_node_id = node.id();
        for segment in segments {
            let mut anchor = node
                .tree()
                .get_mut(next_node_id)
                .expect("previously inserted node must exist");
            next_node_id = segment.insert(&self.bg_color, &mut anchor);
        }
    }

    /// Splits `text` into alternating plain and highlighted segments.
    ///
    /// Matches are taken left to right; at each step the match with the
    /// smallest start offset wins (ties go to the pattern listed first).
    /// Zero-length matches are never emitted and never stall the scan.
    fn segmentize_text(&self, text: &str) -> Vec<TextSegment> {
        // End of the last segment pushed to `out`.
        let mut emitted = 0;
        // Offset at which the next search starts. It can run ahead of
        // `emitted` while empty matches are being skipped.
        let mut search_from = 0;
        let mut out = Vec::new();

        while search_from < text.len() {
            let set_matches = self.regexset.matches_at(text, search_from);
            let earliest = set_matches
                .iter()
                .filter_map(|i| self.patterns[i].find_at(text, search_from))
                // On equal start offsets prefer a non-empty match
                // (`false < true`), so an empty match cannot hide a real one.
                .min_by_key(|m| (m.start(), m.start() == m.end()));
            let m = match earliest {
                Some(m) => m,
                None => break,
            };

            if m.start() == m.end() {
                // An empty match would leave the cursor where it is and loop
                // forever; step over the next character instead.
                match text[m.end()..].chars().next() {
                    Some(ch) => search_from = m.end() + ch.len_utf8(),
                    None => break,
                }
                continue;
            }

            if m.start() > emitted {
                out.push(TextSegment::Text(text[emitted..m.start()].into()));
            }
            out.push(TextSegment::Highlight(text[m.start()..m.end()].into()));
            emitted = m.end();
            search_from = m.end();
        }

        if emitted < text.len() {
            out.push(TextSegment::Text(text[emitted..].into()));
        }
        out
    }
}
#[async_trait::async_trait]
impl FeedFilter for Highlight {
    /// Highlights the description of every post in `feed`.
    ///
    /// Posts are taken out of the feed, rewritten in place and put back.
    /// Posts for which `description_mut` returns `None` are left unchanged.
    async fn run(&self, feed: &mut crate::feed::Feed) -> Result<()> {
        let mut posts = feed.take_posts();
        for post in &mut posts {
            match post.description_mut() {
                Some(description) => {
                    let highlighted = self.highlight_html(description);
                    *description = highlighted;
                }
                None => continue,
            }
        }
        feed.set_posts(posts);
        Ok(())
    }
}
fn insert_sibling_fragment(
node: &mut NodeMut<'_, Node>,
fragment: &str,
) -> NodeId {
let new_tree = scraper::Html::parse_fragment(fragment).tree;
let new_root = node.tree().extend_tree(new_tree);
let root_node_id = fragment_root_node_id(new_root.into());
node.insert_id_after(root_node_id).id()
}
// Unit tests for the highlight filter: HTML rewriting and config parsing.
#[cfg(test)]
mod test {
use crate::test_utils::assert_filter_parse;
use super::*;
// Keywords must be highlighted case-insensitively in text nodes only: the
// input also contains "foo" as an attribute value and "bar" inside a
// comment, and the expected output leaves both of those untouched.
#[test]
fn test_highlighting() {
let keywords = vec!["foo", "bar"];
let highlight = Highlight::new(&keywords, "#ffff00".into())
.expect("failed to build highlighter");
let html = r#"<html><p class="foo">FOO<div><!-- bar -->foo<br> bar</div></p></html>
"#;
let actual = highlight.highlight_html(html);
let expected = r#"<html><p class="foo"><span class="rss-funnel-hl" style="background-color: #ffff00">FOO</span><div><!-- bar --><span class="rss-funnel-hl" style="background-color: #ffff00">foo</span><br> <span style="background-color: #ffff00" class="rss-funnel-hl">bar</span></div></p></html>
"#;
// Compare parsed trees rather than strings, presumably so that
// serialization details (e.g. attribute order) do not affect the result.
assert_eq!(
Html::parse_fragment(&actual).tree,
Html::parse_fragment(expected).tree
);
}
// Both config shapes must parse: a `keywords` list and a `patterns` list.
// NOTE(review): `assert_filter_parse` lives in `crate::test_utils`; it
// presumably parses the YAML and compares it with the expected config.
#[test]
fn test_parse_config() {
// Literal keywords form.
assert_filter_parse(
r#"
highlight:
keywords:
- foo
- bar
bg_color: '#ffff00'
"#,
HighlightConfig {
keywords: KeywordsOrPatterns::Keywords {
keywords: vec!["foo".into(), "bar".into()],
},
bg_color: Some("#ffff00".into()),
},
);
// Regex patterns form.
assert_filter_parse(
r#"
highlight:
patterns:
- '\bfoo\b'
bg_color: '#ffff00'
"#,
HighlightConfig {
keywords: KeywordsOrPatterns::Patterns {
patterns: serde_regex::Serde(vec![Regex::new(r"\bfoo\b").unwrap()]),
},
bg_color: Some("#ffff00".into()),
},
);
}
}