url-sanitize-core 2.0.2

Pure-Rust library for removing tracking parameters and unwrapping tracking redirects.
Documentation
use regex_lite::Regex;

use crate::sanitize::sanitize_with;
use crate::types::{
    RedirectMatchPart, RedirectPrependScheme, RedirectTargetEncoding, RuleKind, RuleSource,
    SanitizeResult, SanitizerOptions, SanitizerRule,
};

#[derive(Debug)]
pub struct CompileError(pub String);

impl std::fmt::Display for CompileError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}

impl std::error::Error for CompileError {}

pub(crate) struct CompiledRule {
    pub source: RuleSource,
    pub provider: String,
    pub kind: RuleKind,
    pub url_pattern: Option<Regex>,
    pub exceptions: Vec<Regex>,
    pub body: CompiledBody,
}

pub(crate) enum CompiledBody {
    StripParam {
        param_pattern: Regex,
        value_pattern: Option<Regex>,
        is_referral_marketing: bool,
    },
    RawReplace {
        pattern: Regex,
        replacement: String,
    },
    UnwrapRedirect {
        pattern: Regex,
        capture_group: usize,
        match_part: RedirectMatchPart,
        target_encoding: RedirectTargetEncoding,
        prepend_scheme: Option<RedirectPrependScheme>,
        target_template: Option<String>,
    },
    BlockDomain,
}

pub struct Sanitizer {
    pub(crate) rules: Vec<CompiledRule>,
    pub(crate) options: SanitizerOptions,
}

fn compile_one(r: &SanitizerRule) -> Option<CompiledRule> {
    fn try_re(pat: &str) -> Option<Regex> {
        Regex::new(pat).ok()
    }
    fn try_re_ci(pat: &str) -> Option<Regex> {
        Regex::new(&format!("(?i){}", pat)).ok()
    }
    fn compile_exceptions(list: &[String]) -> Vec<Regex> {
        list.iter().filter_map(|p| try_re(p)).collect()
    }

    match r {
        SanitizerRule::StripParam {
            source,
            provider,
            url_pattern,
            param_pattern,
            value_pattern,
            exceptions,
            is_referral_marketing,
        } => {
            let pp = try_re_ci(&format!("^(?:{})$", param_pattern))?;
            let vp = match value_pattern.as_deref() {
                Some(p) => Some(try_re_ci(&format!("^(?:{})$", p))?),
                None => None,
            };
            let urlp = match url_pattern.as_deref() {
                Some(p) => Some(try_re_ci(p)?),
                None => None,
            };
            Some(CompiledRule {
                source: *source,
                provider: provider.clone(),
                kind: RuleKind::StripParam,
                url_pattern: urlp,
                exceptions: compile_exceptions(exceptions),
                body: CompiledBody::StripParam {
                    param_pattern: pp,
                    value_pattern: vp,
                    is_referral_marketing: *is_referral_marketing,
                },
            })
        }
        SanitizerRule::RawReplace {
            source,
            provider,
            url_pattern,
            pattern,
            replacement,
            exceptions,
        } => {
            let p = try_re_ci(pattern)?;
            let urlp = match url_pattern.as_deref() {
                Some(p) => Some(try_re_ci(p)?),
                None => None,
            };
            Some(CompiledRule {
                source: *source,
                provider: provider.clone(),
                kind: RuleKind::RawReplace,
                url_pattern: urlp,
                exceptions: compile_exceptions(exceptions),
                body: CompiledBody::RawReplace {
                    pattern: p,
                    replacement: replacement.clone(),
                },
            })
        }
        SanitizerRule::UnwrapRedirect {
            source,
            provider,
            url_pattern,
            pattern,
            capture_group,
            match_part,
            target_encoding,
            prepend_scheme,
            target_template,
            exceptions,
        } => {
            let p = try_re_ci(pattern)?;
            let urlp = match url_pattern.as_deref() {
                Some(p) => Some(try_re_ci(p)?),
                None => None,
            };
            Some(CompiledRule {
                source: *source,
                provider: provider.clone(),
                kind: RuleKind::UnwrapRedirect,
                url_pattern: urlp,
                exceptions: compile_exceptions(exceptions),
                body: CompiledBody::UnwrapRedirect {
                    pattern: p,
                    capture_group: *capture_group as usize,
                    match_part: *match_part,
                    target_encoding: *target_encoding,
                    prepend_scheme: *prepend_scheme,
                    target_template: target_template.clone(),
                },
            })
        }
        SanitizerRule::BlockDomain {
            source,
            provider,
            url_pattern,
            exceptions,
        } => {
            let urlp = try_re_ci(url_pattern)?;
            Some(CompiledRule {
                source: *source,
                provider: provider.clone(),
                kind: RuleKind::BlockDomain,
                url_pattern: Some(urlp),
                exceptions: compile_exceptions(exceptions),
                body: CompiledBody::BlockDomain,
            })
        }
    }
}

impl Sanitizer {
    pub fn compile(rules: &[SanitizerRule], options: SanitizerOptions) -> Sanitizer {
        let compiled: Vec<CompiledRule> = rules.iter().filter_map(compile_one).collect();
        Sanitizer {
            rules: compiled,
            options,
        }
    }

    pub fn sanitize(&self, input: &str) -> SanitizeResult {
        sanitize_with(&self.rules, &self.options, input)
    }
}