rss-funnel 0.0.5

A composable feed processing pipeline
use regex::{Regex, RegexSet};
use serde::{Deserialize, Serialize};

use crate::util::{ConfigError, Result, SingleOrVec};

use super::{FeedFilter, FeedFilterConfig};

/// Configuration of the `keep_only` filter.
///
/// (De)serialized transparently as the wrapped `AnyMatchConfig`. Building it
/// produces a [`Select`] that keeps only the posts matching the patterns.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(transparent)]
pub struct KeepOnlyConfig(AnyMatchConfig);

/// Configuration of the `discard` filter.
///
/// (De)serialized transparently as the wrapped `AnyMatchConfig`. Building it
/// produces a [`Select`] that drops the posts matching the patterns.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(transparent)]
pub struct DiscardConfig(AnyMatchConfig);

/// Every configuration shape accepted by the `keep_only` and `discard`
/// filters.
///
/// The enum is `untagged`, so serde selects the variant from the shape of the
/// value instead of from an explicit tag.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(untagged)]
enum AnyMatchConfig {
  /// A bare string, treated as a single `contains` entry.
  SingleContains(String),
  /// A list of strings, treated as several `contains` entries.
  MultipleContains(Vec<String>),
  /// The full mapping form with explicit options.
  MatchConfig(MatchConfig),
}

/// Full form of the matching options shared by `keep_only` and `discard`.
///
/// Every field carries `#[serde(default)]`, so any of them may be omitted
/// from the configuration and falls back to its `Default` value.
#[derive(Serialize, Deserialize, Clone, Debug)]
struct MatchConfig {
  /// Regular expressions to look for in the selected field(s).
  #[serde(default)]
  matches: SingleOrVec<serde_regex::Serde<Regex>>,
  /// Literal strings to look for; each one is regex-escaped before it is
  /// added to the pattern set.
  #[serde(default)]
  contains: SingleOrVec<String>,
  /// Which part of a post is searched.
  #[serde(default)]
  field: Field,
  /// Requested case sensitivity of the match; `false` when omitted.
  #[serde(default)]
  case_sensitive: bool,
}

impl Default for MatchConfig {
  /// Creates a configuration with no patterns at all, the default [`Field`]
  /// and `case_sensitive` switched off.
  fn default() -> Self {
    let matches = SingleOrVec::empty();
    let contains = SingleOrVec::empty();

    Self {
      matches,
      contains,
      field: Default::default(),
      case_sensitive: false,
    }
  }
}

impl AnyMatchConfig {
  /// Normalizes any of the accepted shapes into a full [`MatchConfig`].
  ///
  /// The shorthand forms become `contains` entries on an otherwise default
  /// configuration; the full form is returned unchanged.
  fn into_match_config(self) -> MatchConfig {
    let literals = match self {
      Self::MatchConfig(full) => return full,
      Self::SingleContains(literal) => vec![literal],
      Self::MultipleContains(list) => list,
    };

    MatchConfig {
      contains: SingleOrVec::Vec(literals),
      ..Default::default()
    }
  }
}

impl MatchConfig {
  /// Returns the source text of every pattern to compile.
  ///
  /// The `matches` regexes come first, verbatim, followed by the `contains`
  /// literals, which are escaped so that they only match themselves.
  fn regexes(&self) -> Vec<String> {
    let mut out = vec![];

    for m in &self.matches {
      out.push(m.as_str().to_string());
    }
    for p in &self.contains {
      out.push(regex::escape(p));
    }

    out
  }

  /// Compiles all patterns into a single [`RegexSet`].
  ///
  /// The set is built case-insensitively unless `case_sensitive` is `true`.
  /// The patterns are recompiled from their source text, so the flag applies
  /// to the `matches` regexes as well as to the `contains` literals.
  ///
  /// # Errors
  ///
  /// Returns the regex compilation error, converted through [`ConfigError`],
  /// when the patterns cannot be compiled into a set.
  fn regex_set(&self) -> Result<RegexSet> {
    let set = regex::RegexSetBuilder::new(self.regexes())
      // The option is expressed as "sensitive" while the builder takes
      // "insensitive", hence the negation.
      .case_insensitive(!self.case_sensitive)
      .build()
      .map_err(ConfigError::from)?;

    Ok(set)
  }

  /// Consumes the configuration and builds the [`Select`] filter that
  /// applies `action` to posts matching the compiled patterns.
  ///
  /// # Errors
  ///
  /// Propagates the error from [`MatchConfig::regex_set`].
  fn into_select(self, action: Action) -> Result<Select> {
    let needle = self.regex_set()?;
    let field = self.field;

    Ok(Select {
      needle,
      field,
      action,
    })
  }
}

/// The part of a post that the patterns are matched against.
///
/// Written in `snake_case` in the configuration (e.g. `field: title`).
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
#[serde(rename_all = "snake_case")]
enum Field {
  /// Only the post title.
  Title,
  /// Only the post description.
  Content,
  /// Both the title and the description.
  Any,
}

impl Field {
  /// Collects the texts of `post` selected by this field.
  ///
  /// The title, when selected and present, comes before the description.
  /// Parts that the post does not have are left out, so the result may be
  /// empty.
  fn extract<'a>(&self, post: &'a crate::feed::Post) -> Vec<&'a str> {
    let (with_title, with_description) = match self {
      Self::Title => (true, false),
      Self::Content => (false, true),
      Self::Any => (true, true),
    };

    let mut texts = Vec::new();
    if with_title {
      texts.extend(post.title());
    }
    if with_description {
      texts.extend(post.description());
    }

    texts
  }
}

impl Default for Field {
  fn default() -> Self {
    Self::Any
  }
}

/// What a [`Select`] filter does with a post that matches its patterns.
#[derive(Clone, Copy, Debug)]
enum Action {
  /// Keep matching posts and drop the rest.
  Include,
  /// Drop matching posts and keep the rest.
  Exclude,
}

#[async_trait::async_trait]
impl FeedFilterConfig for KeepOnlyConfig {
  type Filter = Select;

  /// Builds a [`Select`] that keeps only the matching posts.
  ///
  /// Fails when the configured patterns cannot be compiled.
  async fn build(self) -> Result<Self::Filter> {
    let Self(any_config) = self;
    let match_config = any_config.into_match_config();

    match_config.into_select(Action::Include)
  }
}

#[async_trait::async_trait]
impl FeedFilterConfig for DiscardConfig {
  type Filter = Select;

  /// Builds a [`Select`] that drops the matching posts.
  ///
  /// Fails when the configured patterns cannot be compiled.
  async fn build(self) -> Result<Self::Filter> {
    let Self(any_config) = self;
    let match_config = any_config.into_match_config();

    match_config.into_select(Action::Exclude)
  }
}

/// Feed filter that keeps or drops posts depending on whether the selected
/// field(s) match any of the compiled patterns.
#[derive(Clone, Debug)]
pub struct Select {
  /// Compiled set of patterns; a text matches when any pattern matches.
  needle: RegexSet,
  /// Which part of each post is tested.
  field: Field,
  /// Whether matching posts are kept or dropped.
  action: Action,
}

impl Select {
  /// Returns `true` when at least one text in `haystack` matches any of the
  /// patterns; an empty `haystack` never matches.
  fn matches(&self, haystack: &[&str]) -> bool {
    for text in haystack {
      if self.needle.is_match(text) {
        return true;
      }
    }

    false
  }

  /// Decides whether `post` survives the filter.
  ///
  /// With [`Action::Include`] a post is kept when it matches; with
  /// [`Action::Exclude`] it is kept when it does not.
  fn should_keep(&self, post: &crate::feed::Post) -> bool {
    let texts = self.field.extract(post);
    let is_hit = self.matches(&texts);

    let keep_on_hit = match self.action {
      Action::Include => true,
      Action::Exclude => false,
    };

    is_hit == keep_on_hit
  }
}

#[async_trait::async_trait]
impl FeedFilter for Select {
  /// Replaces the posts of `feed` with the subset that passes the filter,
  /// preserving their original order.
  async fn run(&self, feed: &mut crate::feed::Feed) -> Result<()> {
    let kept: Vec<_> = feed
      .take_posts()
      .into_iter()
      .filter(|post| self.should_keep(post))
      .collect();

    feed.set_posts(kept);
    Ok(())
  }
}

#[cfg(test)]
mod test {
  use super::*;
  use crate::test_utils::assert_filter_parse;

  /// Expected parse result shared by the two "full" fixtures: two regexes,
  /// title-only matching and case sensitivity switched on.
  fn full_match_config() -> AnyMatchConfig {
    let digits = Regex::new(r"\d+").unwrap();
    let foo_word = Regex::new(r"\bfoo\b").unwrap();

    AnyMatchConfig::MatchConfig(MatchConfig {
      matches: SingleOrVec::Vec(vec![digits.into(), foo_word.into()]),
      contains: SingleOrVec::empty(),
      field: Field::Title,
      case_sensitive: true,
    })
  }

  // `keep_only` given as a full mapping.
  #[test]
  fn test_config_keep_only_full() {
    let yaml = r#"
      keep_only:
        matches:
          - '\d+'
          - '\bfoo\b'
        field: title
        case_sensitive: true
    "#;

    assert_filter_parse(yaml, KeepOnlyConfig(full_match_config()));
  }

  // `keep_only` given as a bare string.
  #[test]
  fn test_config_keep_only_single() {
    let yaml = r#"
      keep_only: foo
    "#;

    let want = AnyMatchConfig::SingleContains("foo".into());

    assert_filter_parse(yaml, KeepOnlyConfig(want));
  }

  // `keep_only` given as a list of strings.
  #[test]
  fn test_config_keep_only_multiple() {
    let yaml = r#"
        keep_only:
            - foo
            - bar
        "#;

    let literals = vec!["foo".into(), "bar".into()];
    let want = AnyMatchConfig::MultipleContains(literals);

    assert_filter_parse(yaml, KeepOnlyConfig(want));
  }

  // `discard` given as a full mapping.
  #[test]
  fn test_config_discard_full() {
    let yaml = r#"
      discard:
        matches:
          - '\d+'
          - '\bfoo\b'
        field: title
        case_sensitive: true
    "#;

    assert_filter_parse(yaml, DiscardConfig(full_match_config()));
  }
}