adblock 0.12.2

Native Rust module for Adblock Plus syntax (e.g. EasyList, EasyPrivacy) filter parsing and matching.
Documentation
//! Compiled regexes can take up large amounts of memory. To reduce the overall memory footprint of
//! the [`crate::Engine`], infrequently used regexes can be discarded. The [`RegexManager`] is
//! responsible for managing the storage of regexes used by filters.

use crate::filters::network::{NetworkFilterMask, NetworkFilterMaskHelper};

use regex::{
    bytes::Regex as BytesRegex, bytes::RegexBuilder as BytesRegexBuilder,
    bytes::RegexSet as BytesRegexSet, bytes::RegexSetBuilder as BytesRegexSetBuilder, Regex,
};
use std::sync::LazyLock;

use std::collections::HashMap;
use std::fmt;
use std::time::Duration;

#[cfg(test)]
#[cfg(not(target_arch = "wasm32"))]
use mock_instant::thread_local::Instant;
#[cfg(not(test))]
#[cfg(not(target_arch = "wasm32"))]
use std::time::Instant;

#[cfg(target_arch = "wasm32")]
#[derive(Clone, Copy)]
pub struct Instant;
#[cfg(target_arch = "wasm32")]
impl Instant {
    pub fn now() -> Self {
        Self
    }
}

/// `*const NetworkFilter` could technically leak across threads through `RegexDebugEntry::id`, but
/// it's disguised as a unique identifier and not intended to be dereferenced.
unsafe impl Send for RegexManager {}

const DEFAULT_CLEAN_UP_INTERVAL: Duration = Duration::from_secs(30);
const DEFAULT_DISCARD_UNUSED_TIME: Duration = Duration::from_secs(180);

/// Reports [`RegexManager`] metrics that may be useful for creating an optimized
/// [`RegexManagerDiscardPolicy`].
#[cfg(feature = "debug-info")]
pub struct RegexDebugInfo {
    /// Information about each regex contained in the [`RegexManager`].
    pub regex_data: Vec<RegexDebugEntry>,
    /// Total count of compiled regexes.
    pub compiled_regex_count: usize,
}

/// Describes metrics about a single regex from the [`RegexManager`].
#[cfg(feature = "debug-info")]
pub struct RegexDebugEntry {
    /// Id for this particular regex, which is constant and unique for its lifetime.
    ///
    /// Note that there are no guarantees about a particular id's constancy or uniqueness beyond
    /// the lifetime of a corresponding regex.
    pub id: u64,
    /// A string representation of this regex, if available. It may be `None` if the regex has been
    /// cleaned up to conserve memory.
    pub regex: Option<String>,
    /// When this regex was last used.
    pub last_used: Instant,
    /// How many times this regex has been used.
    pub usage_count: usize,
}

#[derive(Debug, Clone)]
pub enum CompiledRegex {
    Compiled(BytesRegex),
    CompiledSet(BytesRegexSet),
    MatchAll,
    RegexParsingError(regex::Error),
}

impl CompiledRegex {
    pub fn is_match(&self, pattern: &str) -> bool {
        match &self {
            CompiledRegex::MatchAll => true, // simple case for matching everything, e.g. for empty filter
            CompiledRegex::RegexParsingError(_e) => false, // no match if regex didn't even compile
            CompiledRegex::Compiled(r) => r.is_match(pattern.as_bytes()),
            CompiledRegex::CompiledSet(r) => {
                // let matches: Vec<_> = r.matches(pattern).into_iter().collect();
                // println!("Matching {} against RegexSet: {:?}", pattern, matches);
                r.is_match(pattern.as_bytes())
            }
        }
    }
}

impl fmt::Display for CompiledRegex {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match &self {
            CompiledRegex::MatchAll => write!(f, ".*"), // simple case for matching everything, e.g. for empty filter
            CompiledRegex::RegexParsingError(_e) => write!(f, "ERROR"), // no match if regex didn't even compile
            CompiledRegex::Compiled(r) => write!(f, "{}", r.as_str()),
            CompiledRegex::CompiledSet(r) => write!(f, "{}", r.patterns().join(" | ")),
        }
    }
}

struct RegexEntry {
    regex: Option<CompiledRegex>,
    last_used: Instant,
    usage_count: usize,
}

/// Used for customization of regex discarding behavior in the [`RegexManager`].
pub struct RegexManagerDiscardPolicy {
    /// The [`RegexManager`] will check for and cleanup unused filters on this interval.
    pub cleanup_interval: Duration,
    /// The [`RegexManager`] will discard a regex if it hasn't been used for this much time.
    pub discard_unused_time: Duration,
}

impl Default for RegexManagerDiscardPolicy {
    fn default() -> Self {
        Self {
            cleanup_interval: DEFAULT_CLEAN_UP_INTERVAL,
            discard_unused_time: DEFAULT_DISCARD_UNUSED_TIME,
        }
    }
}

type RandomState = std::hash::BuildHasherDefault<seahash::SeaHasher>;

/// A manager that creates and stores all regular expressions used by filters.
/// Rarely used entries are discarded to save memory.
///
/// The [`RegexManager`] is not thread safe, so any access to it must be synchronized externally.
pub struct RegexManager {
    map: HashMap<u64, RegexEntry, RandomState>,
    compiled_regex_count: usize,
    now: Instant,
    #[cfg_attr(target_arch = "wasm32", allow(unused))]
    last_cleanup: Instant,
    discard_policy: RegexManagerDiscardPolicy,
}

impl Default for RegexManager {
    fn default() -> Self {
        Self {
            map: Default::default(),
            compiled_regex_count: 0,
            now: Instant::now(),
            last_cleanup: Instant::now(),
            discard_policy: Default::default(),
        }
    }
}

fn make_regexp<'a, FiltersIter>(mask: NetworkFilterMask, filters: FiltersIter) -> CompiledRegex
where
    FiltersIter: Iterator<Item = &'a str> + ExactSizeIterator,
{
    compile_regex(
        filters,
        mask.is_right_anchor(),
        mask.is_left_anchor(),
        mask.is_complete_regex(),
    )
}

/// Compiles a filter pattern to a regex. This is only performed *lazily* for
/// filters containing at least a * or ^ symbol. Because Regexes are expansive,
/// we try to convert some patterns to plain filters.
#[allow(clippy::trivial_regex)]
pub(crate) fn compile_regex<'a, I>(
    filters: I,
    is_right_anchor: bool,
    is_left_anchor: bool,
    is_complete_regex: bool,
) -> CompiledRegex
where
    I: Iterator<Item = &'a str> + ExactSizeIterator,
{
    // Escape special regex characters: |.$+?{}()[]\
    static SPECIAL_RE: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"([\|\.\$\+\?\{\}\(\)\[\]])").unwrap());
    // * can match anything
    static WILDCARD_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*").unwrap());
    // ^ can match any separator or the end of the pattern
    static ANCHOR_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\^(.)").unwrap());
    // ^ can match any separator or the end of the pattern
    static ANCHOR_RE_EOL: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\^$").unwrap());

    let mut escaped_patterns = Vec::with_capacity(filters.len());
    for filter_str in filters {
        // If any filter is empty, the entire set matches anything
        if filter_str.is_empty() {
            return CompiledRegex::MatchAll;
        }
        if is_complete_regex {
            // unescape unrecognised escaping sequences, otherwise a normal regex
            let unescaped = filter_str[1..filter_str.len() - 1]
                .replace("\\/", "/")
                .replace("\\:", ":");

            escaped_patterns.push(unescaped);
        } else {
            let repl = SPECIAL_RE.replace_all(filter_str, "\\$1");
            let repl = WILDCARD_RE.replace_all(&repl, ".*");
            // in adblock rules, '^' is a separator.
            // The separator character is anything but a letter, a digit, or one of the following: _ - . %
            let repl = ANCHOR_RE.replace_all(&repl, "(?:[^\\w\\d\\._%-])$1");
            let repl = ANCHOR_RE_EOL.replace_all(&repl, "(?:[^\\w\\d\\._%-]|$)");

            // Should match start or end of url
            let left_anchor = if is_left_anchor { "^" } else { "" };
            let right_anchor = if is_right_anchor { "$" } else { "" };
            let filter = format!("{left_anchor}{repl}{right_anchor}");

            escaped_patterns.push(filter);
        }
    }

    if escaped_patterns.is_empty() {
        CompiledRegex::MatchAll
    } else if escaped_patterns.len() == 1 {
        let pattern = &escaped_patterns[0];
        match BytesRegexBuilder::new(pattern).unicode(false).build() {
            Ok(compiled) => CompiledRegex::Compiled(compiled),
            Err(e) => {
                // println!("Regex parsing failed ({:?})", e);
                CompiledRegex::RegexParsingError(e)
            }
        }
    } else {
        match BytesRegexSetBuilder::new(escaped_patterns)
            .unicode(false)
            .build()
        {
            Ok(compiled) => CompiledRegex::CompiledSet(compiled),
            Err(e) => CompiledRegex::RegexParsingError(e),
        }
    }
}

impl RegexManager {
    /// Check whether or not a regex network filter matches a certain URL pattern, using the
    /// [`RegexManager`]'s managed regex storage.
    pub fn matches<'a, FiltersIter>(
        &mut self,
        mask: NetworkFilterMask,
        filters: FiltersIter,
        key: u64,
        pattern: &str,
    ) -> bool
    where
        FiltersIter: Iterator<Item = &'a str> + ExactSizeIterator,
    {
        if !mask.is_regex() && !mask.is_complete_regex() {
            return true;
        }
        use std::collections::hash_map::Entry;
        match self.map.entry(key) {
            Entry::Occupied(mut e) => {
                let v = e.get_mut();
                v.usage_count += 1;
                v.last_used = self.now;
                if v.regex.is_none() {
                    // A discarded entry, recreate it:
                    v.regex = Some(make_regexp(mask, filters));
                    self.compiled_regex_count += 1;
                }
                v.regex.as_ref().unwrap().is_match(pattern)
            }
            Entry::Vacant(e) => {
                self.compiled_regex_count += 1;
                let new_entry = RegexEntry {
                    regex: Some(make_regexp(mask, filters)),
                    last_used: self.now,
                    usage_count: 1,
                };
                e.insert(new_entry)
                    .regex
                    .as_ref()
                    .unwrap()
                    .is_match(pattern)
            }
        }
    }

    /// The [`RegexManager`] is just a struct and doesn't manage any worker threads, so this method
    /// must be called periodically to ensure that it can track usage patterns of regexes over
    /// time. This method will handle periodically discarding filters if necessary.
    #[cfg(not(target_arch = "wasm32"))]
    pub fn update_time(&mut self) {
        self.now = Instant::now();
        if !self.discard_policy.cleanup_interval.is_zero()
            && self.now - self.last_cleanup >= self.discard_policy.cleanup_interval
        {
            self.last_cleanup = self.now;
            self.cleanup();
        }
    }

    #[cfg(not(target_arch = "wasm32"))]
    pub(crate) fn cleanup(&mut self) {
        let now = self.now;
        for v in self.map.values_mut() {
            if now - v.last_used >= self.discard_policy.discard_unused_time {
                // Discard the regex to save memory.
                v.regex = None;
            }
        }
    }

    /// Customize the discard behavior of this [`RegexManager`].
    pub fn set_discard_policy(&mut self, new_discard_policy: RegexManagerDiscardPolicy) {
        self.discard_policy = new_discard_policy;
    }

    /// Discard one regex, identified by its id from a [`RegexDebugEntry`].
    #[cfg(feature = "debug-info")]
    pub fn discard_regex(&mut self, regex_id: u64) {
        self.map
            .iter_mut()
            .filter(|(k, _)| { **k } == regex_id)
            .for_each(|(_, v)| {
                v.regex = None;
            });
    }

    #[cfg(feature = "debug-info")]
    pub(crate) fn get_debug_regex_data(&self) -> Vec<RegexDebugEntry> {
        use itertools::Itertools;
        self.map
            .iter()
            .map(|(k, e)| RegexDebugEntry {
                id: { *k },
                regex: e.regex.as_ref().map(|x| x.to_string()),
                last_used: e.last_used,
                usage_count: e.usage_count,
            })
            .collect_vec()
    }

    #[cfg(feature = "debug-info")]
    pub(crate) fn get_compiled_regex_count(&self) -> usize {
        self.compiled_regex_count
    }

    /// Collect metrics that may be useful for creating an optimized [`RegexManagerDiscardPolicy`].
    #[cfg(feature = "debug-info")]
    pub fn get_debug_info(&self) -> RegexDebugInfo {
        RegexDebugInfo {
            regex_data: self.get_debug_regex_data(),
            compiled_regex_count: self.get_compiled_regex_count(),
        }
    }
}

#[cfg(test)]
#[path = "../tests/unit/regex_manager.rs"]
mod unit_tests;