html-filter 0.2.3

Parse HTML into a typed tree, then search for tags, attributes and classes, filter out comments, or find and extract the exact data you want with a short builder pattern. Zero dependencies, zero overhead.
Documentation
//! Keeps track of the rules applied to attributes or tags. Each of them can
//! be either blacklisted or whitelisted by the user. This module handles the
//! logic for combining these rules.

use std::collections::HashMap;

use crate::types::tag::Attribute;

/// Stores the status of an element, i.e., whether it ought to be kept or
/// removed.
///
/// This contains only the explicit rules given by the user at the definition of
/// [`super::Filter`].
///
/// Whitelisted and blacklisted names live together in one list; each entry
/// carries a flag telling which of the two it is.
#[derive(Debug, PartialEq, Eq)]
pub struct BlackWhiteList {
    /// Fallback rule for names that have no entry in `items`.
    ///
    /// It is only consulted when a lookup by name finds nothing.
    default: bool,
    /// Explicit rules, stored as `(name, keep)` pairs.
    ///
    /// The boolean is `true` if the name is whitelisted, and `false` if it is
    /// blacklisted.
    ///
    /// A `Vec` is used rather than a [`HashMap`]: the list never holds a lot of
    /// elements, as it is bound by the number of valid html tags in practice,
    /// and a `Vec` supports const behaviour.
    items: Vec<(String, bool)>,
    /// `true` as long as no element was pushed as whitelisted.
    ///
    /// Once cleared by a whitelisting push, it is never set back to `true`.
    whitelist_empty: bool,
}

impl BlackWhiteList {
    /// Looks up `name` and returns its `keep` flag, or `None` if there is no
    /// rule for it.
    fn get(&self, name: &str) -> Option<bool> {
        self.items.iter().find_map(|(key, keep)| (key == name).then_some(*keep))
    }

    /// Looks up `name` and returns a mutable reference to its `keep` flag, or
    /// `None` if there is no rule for it.
    fn get_mut(&mut self, name: &str) -> Option<&mut bool> {
        for (key, keep) in &mut self.items {
            if key.as_str() == name {
                return Some(keep);
            }
        }
        None
    }
}

impl BlackWhiteList {
    /// Check the status of an element
    ///
    /// A name with an explicit rule is reported as whitelisted or blacklisted
    /// accordingly. A name without a rule is [`ElementState::NotSpecified`]
    /// only if nothing was whitelisted and the default is `true`; otherwise it
    /// is [`ElementState::BlackListed`].
    pub fn check(&self, name: &str) -> ElementState {
        match self.get(name) {
            Some(true) => ElementState::WhiteListed,
            Some(false) => ElementState::BlackListed,
            None if self.is_empty() && self.default => ElementState::NotSpecified,
            None => ElementState::BlackListed,
        }
    }

    /// Checks if no element was ever pushed as whitelisted
    ///
    /// Blacklisted entries do not count: a list holding only blacklisted names
    /// is still reported as empty.
    pub const fn is_empty(&self) -> bool {
        self.whitelist_empty
    }

    /// Checks if a name was explicitly blacklisted
    ///
    /// A name without any rule is considered blacklisted when the default is
    /// `false`.
    pub fn is_explicitly_blacklisted(&self, name: &str) -> bool {
        !self.get(name).unwrap_or(self.default)
    }

    /// Returns a default [`Self`]: no rules, and a default of `true`.
    pub const fn new() -> Self {
        Self { default: true, items: Vec::new(), whitelist_empty: true }
    }

    /// Pushes an element as whitelisted (`keep == true`) or blacklisted
    /// (`keep == false`)
    ///
    /// # Errors
    ///
    /// Returns `Err(())` if `name` already had a rule with the opposite value.
    /// The new value replaces the old one nonetheless.
    pub fn push(&mut self, name: String, keep: bool) -> Result<(), ()> {
        // Cleared by the first whitelisting push; left untouched otherwise.
        self.whitelist_empty &= !keep;
        match self.get_mut(&name) {
            Some(current) => {
                let unchanged = *current == keep;
                *current = keep;
                if unchanged { Ok(()) } else { Err(()) }
            }
            None => {
                self.items.push((name, keep));
                Ok(())
            }
        }
    }

    /// Sets the default rule
    ///
    /// If no rule is specified for the given tag, default is applied.
    pub const fn set_default(&mut self, default: bool) {
        self.default = default;
    }
}

impl Default for BlackWhiteList {
    /// Same as [`BlackWhiteList::new`].
    fn default() -> Self {
        Self::new()
    }
}

/// Status of an element
///
/// An element can be whitelisted or blacklisted by the user, or left without
/// any rule. This state covers all three cases.
#[derive(Debug)]
pub enum ElementState {
    /// Element ought to be removed
    BlackListed,
    /// No rules applied for this element
    NotSpecified,
    /// Element ought to be kept
    WhiteListed,
}

impl ElementState {
    /// Combines the outcome of two successive checks
    ///
    /// The result is [`Self::BlackListed`] if either side is blacklisted,
    /// [`Self::WhiteListed`] if neither is blacklisted and at least one is
    /// whitelisted, and [`Self::NotSpecified`] only when both sides are.
    pub const fn and(&self, other: &Self) -> Self {
        // Blacklisting wins over everything, then whitelisting wins over the
        // absence of rules.
        if matches!(self, Self::BlackListed) || matches!(other, Self::BlackListed) {
            Self::BlackListed
        } else if matches!(self, Self::WhiteListed) || matches!(other, Self::WhiteListed) {
            Self::WhiteListed
        } else {
            Self::NotSpecified
        }
    }

    /// Turns the state into a keep/remove decision
    ///
    /// Whitelisted elements are allowed, blacklisted ones are not, and
    /// `default` decides for elements without any rule.
    pub const fn is_allowed_or(&self, default: bool) -> bool {
        match *self {
            Self::WhiteListed => true,
            Self::NotSpecified => default,
            Self::BlackListed => false,
        }
    }
}

/// Ways to match an attribute's value to decide whether to keep the tag or not.
#[derive(Debug, PartialEq, Eq)]
pub enum AttributeMatch {
    /// The attribute's value must contain a whitespace-separated word equal to
    /// the given string.
    Contains(String),
    /// The attribute's value must be exactly the given string.
    Is(String),
    /// The attribute must not have a value.
    NoValue,
}

impl AttributeMatch {
    /// Checks if a [`AttributeMatch`] is satisfied by a given attribute value.
    ///
    /// `attribute_value` is `None` when the attribute has no value at all.
    fn matches(&self, attribute_value: Option<&str>) -> bool {
        match (self, attribute_value) {
            (Self::NoValue, None) => true,
            // A value where none is wanted, or no value where one is wanted.
            (Self::NoValue, Some(_)) | (Self::Is(_) | Self::Contains(_), None) => false,
            (Self::Is(expected), Some(actual)) => expected.as_str() == actual,
            (Self::Contains(expected), Some(actual)) =>
                actual.split_whitespace().any(|word| word == expected.as_str()),
        }
    }
}

/// Rules for associating names to values
///
/// Each rule pairs an attribute name with an [`AttributeMatch`] describing the
/// value it applies to. Several rules may target the same attribute name.
// TODO: could add a default to create a method: exact_attributes
#[derive(Default, Debug, PartialEq, Eq)]
pub struct ValueAssociateHash {
    /// Names and attributes explicitly not wanted
    blacklist: Vec<(String, AttributeMatch)>,
    /// Names and attributes explicitly wanted
    whitelist: Vec<(String, AttributeMatch)>,
}

impl ValueAssociateHash {
    /// Checks if the attributes form a correct combination of rules
    ///
    /// Returns [`ElementState::BlackListed`] if a whitelist rule is not
    /// satisfied (attribute missing, or value not matching) or if a blacklist
    /// rule is satisfied. Otherwise returns [`ElementState::NotSpecified`] when
    /// no rules were given, and [`ElementState::WhiteListed`] when there are
    /// rules and all of them pass.
    ///
    /// If an attribute name occurs several times in `attrs`, only its last
    /// occurrence is considered.
    pub fn check(&self, attrs: &[Attribute]) -> ElementState {
        // Keyed by borrowed name: no need to clone every attribute name.
        let found: HashMap<_, _> =
            attrs.iter().map(|attr| (attr.as_name(), attr.as_value())).collect();
        // A rule is satisfied when the attribute is present and its value matches.
        let satisfied = |(name, rule): &(String, AttributeMatch)| {
            found.get(name).is_some_and(|value| rule.matches(value.map(String::as_str)))
        };
        if !self.whitelist.iter().all(satisfied) || self.blacklist.iter().any(satisfied) {
            ElementState::BlackListed
        } else if self.is_empty() {
            ElementState::NotSpecified
        } else {
            ElementState::WhiteListed
        }
    }

    /// Checks if the [`ValueAssociateHash`] wasn't given any rules.
    pub const fn is_empty(&self) -> bool {
        self.whitelist.is_empty() && self.blacklist.is_empty()
    }

    /// Checks if one of the attributes was explicitly blacklisted
    ///
    /// Returns `true` as soon as any attribute of `attrs` satisfies any
    /// blacklist rule registered for its name.
    pub fn is_explicitly_blacklisted(&self, attrs: &[Attribute]) -> bool {
        // Scan the rule list itself: indexing the rules by name would keep only
        // one rule per attribute name and ignore the others.
        attrs.iter().any(|attr| {
            let value = attr.as_value().map(String::as_str);
            self.blacklist
                .iter()
                .any(|(name, rule)| name == attr.as_name() && rule.matches(value))
        })
    }

    /// Returns a default [`Self`], without any rule.
    pub const fn new() -> Self {
        Self { blacklist: Vec::new(), whitelist: Vec::new() }
    }

    /// Adds a rule for the attribute `name`
    ///
    /// The rule goes to the whitelist if `keep` is `true`, and to the blacklist
    /// otherwise. Existing rules for the same name are kept.
    pub fn push(&mut self, name: String, value: AttributeMatch, keep: bool) {
        let rules = if keep { &mut self.whitelist } else { &mut self.blacklist };
        rules.push((name, value));
    }
}