// adblock 0.5.0
//
// Native Rust module for Adblock Plus syntax (e.g. EasyList, EasyPrivacy) filter parsing and matching.
// Documentation
use adblock::engine::Engine;

use serde::Deserialize;
use tokio::runtime::Runtime;

use std::fs::File;
use std::io::BufReader;

/// One row of the regression data set: a network request plus the outcome expected for it.
///
/// The struct is filled by `serde` deserialization, so the field names are the keys looked up in
/// each record; `sourceUrl` is kept in camelCase for that reason (hence the lint allowance).
#[allow(non_snake_case)]
#[derive(Debug, Deserialize)]
struct RequestRuleMatch {
    /// URL of the request being checked.
    url: String,
    /// URL of the page the request is issued from.
    sourceUrl: String,
    /// Request type passed to the engine (the tests in this file use e.g. `"script"`, `"image"`).
    r#type: String,
    /// Whether the engine is expected to report a match for this request.
    blocked: bool,
}

/// Loads the expected request outcomes from `data/regressions.tsv`.
///
/// The file is read as tab-delimited records, each deserialized into a `RequestRuleMatch`.
/// Leading and trailing `"` characters are stripped from the string columns. Records that fail
/// to deserialize are reported on stdout and skipped rather than aborting the load.
///
/// # Panics
///
/// Panics if the data file cannot be opened.
fn load_requests() -> Vec<RequestRuleMatch> {
    let f = File::open("data/regressions.tsv").expect("file not found");
    let mut rdr = csv::ReaderBuilder::new()
        .delimiter(b'\t')
        .from_reader(BufReader::new(f));

    let mut reqs: Vec<RequestRuleMatch> = Vec::new();
    for result in rdr.deserialize::<RequestRuleMatch>() {
        match result {
            Ok(record) => reqs.push(RequestRuleMatch {
                url: record.url.trim_matches('"').to_owned(),
                sourceUrl: record.sourceUrl.trim_matches('"').to_owned(),
                r#type: record.r#type.trim_matches('"').to_owned(),
                blocked: record.blocked,
            }),
            // The whole `Result` is printed (not just the inner error) so the diagnostic text
            // stays exactly as it was.
            failed @ Err(_) => println!("Could not parse {:?}", failed),
        }
    }

    reqs
}

/// Describes an online source of adblock rules.
#[derive(serde::Deserialize)]
pub struct RemoteFilterSource {
    pub uuid: String,
    pub url: String,
    pub title: String,
    pub format: adblock::lists::FilterFormat,
    pub support_url: String,
}

/// Every rule from the default filter lists, parsed into a single `FilterSet`.
///
/// The value is built on first access: the list index is downloaded, each listed rule file is
/// fetched, and all rules are added to one set. Later accesses reuse that result, so the network
/// is only hit once.
static ALL_FILTERS: once_cell::sync::Lazy<adblock::lists::FilterSet> = once_cell::sync::Lazy::new(|| {
    /// Downloads the list index and every rule list it references, then parses all of them.
    async fn get_all_filters() -> adblock::lists::FilterSet {
        const DEFAULT_LISTS_URL: &str = "https://raw.githubusercontent.com/brave/adblock-resources/master/filter_lists/default.json";

        // Index describing which lists exist and where to fetch them from.
        let index_body = reqwest::get(DEFAULT_LISTS_URL).await.unwrap().text().await.unwrap();
        let default_lists: Vec<RemoteFilterSource> = serde_json::from_str(&index_body).unwrap();

        // One download future per list; nothing runs until `join_all` drives them together.
        let downloads = default_lists.iter().map(|list| async move {
            let response = reqwest::get(&list.url)
                .await
                .expect("Could not request rules");
            let rules_text = response
                .text()
                .await
                .expect("Could not get rules as text");
            (list.format, rules_text)
        });

        let mut filter_set = adblock::lists::FilterSet::default();
        for (format, rules_text) in futures::future::join_all(downloads).await {
            let rules: Vec<String> = rules_text.lines().map(str::to_owned).collect();
            filter_set.add_filters(&rules, adblock::lists::ParseOptions { format, ..Default::default() });
        }

        filter_set
    }

    Runtime::new()
        .expect("Could not start Tokio runtime")
        .block_on(get_all_filters())
});

/// Builds an engine from the shared live filter set and turns on the `fb-embeds` and
/// `twitter-embeds` tags.
fn get_blocker_engine() -> Engine {
    let enabled_tags = ["fb-embeds", "twitter-embeds"];

    // Each caller gets its own engine built from a copy of the lazily fetched filter set.
    let mut blocker = Engine::from_filter_set(ALL_FILTERS.clone(), true);
    blocker.use_tags(&enabled_tags);
    blocker
}

/// Downloads the published serialized engine data and loads it into a fresh engine, with the
/// `fb-embeds` and `twitter-embeds` tags turned on.
///
/// # Panics
///
/// Panics if the runtime cannot start, the download fails, or the data cannot be deserialized.
fn get_blocker_engine_deserialized() -> Engine {
    let async_runtime = Runtime::new().expect("Could not start Tokio runtime");

    let dat_url = "https://adblock-data.s3.brave.com/4/rs-ABPFilterParserData.dat";
    let download = async {
        reqwest::get(dat_url)
            .await
            .expect("Could not request rules")
            .bytes()
            .await
    };
    let dat = async_runtime
        .block_on(download)
        .expect("Could not get response as bytes");

    let mut blocker = Engine::default();
    blocker.deserialize(&dat).expect("Deserialization failed");
    blocker.use_tags(&["fb-embeds", "twitter-embeds"]);
    blocker
}

/// Downloads the iOS rule list as text and builds an engine from its lines.
///
/// # Panics
///
/// Panics if the runtime cannot start or the list cannot be downloaded.
fn get_blocker_engine_deserialized_ios() -> Engine {
    let async_runtime = Runtime::new().expect("Could not start Tokio runtime");

    let list_url = "https://adblock-data.s3.brave.com/ios/latest.txt";
    let download = async {
        reqwest::get(list_url)
            .await
            .expect("Could not request rules")
            .text()
            .await
    };
    let list_body = async_runtime
        .block_on(download)
        .expect("Could not get rules as text");

    // One rule per line of the downloaded list.
    let filters: Vec<String> = list_body.lines().map(str::to_owned).collect();

    Engine::from_rules_parametrised(&filters, Default::default(), true, false)
}

#[test]
/// Spot-checks a handful of individual requests against an engine built from the live lists.
///
/// Each failure message states the outcome the assertion actually expects, followed by the
/// filter and exception the engine reported.
fn check_live_specific_urls() {
    let mut engine = get_blocker_engine();
    {
        // This request is asserted NOT to be blocked.
        let checked = engine.check_network_urls(
            "https://static.scroll.com/js/scroll.js",
            "https://www.theverge.com/",
            "script");
        assert!(!checked.matched,
            "Expected no match, got filter {:?}, exception {:?}",
            checked.filter, checked.exception);
    }
    {
        // With the `twitter-embeds` tag disabled, the widget script is asserted to be blocked.
        engine.disable_tags(&["twitter-embeds"]);
        let checked = engine.check_network_urls(
            "https://platform.twitter.com/widgets.js",
            "https://fmarier.github.io/brave-testing/social-widgets.html",
            "script");
        assert!(checked.matched,
            "Expected match, got filter {:?}, exception {:?}",
            checked.filter, checked.exception);
        // Restore the tag so the following checks start from the default configuration.
        engine.enable_tags(&["twitter-embeds"]);
    }
    {
        // NOTE(review): the tag toggling around this check is kept as-is; the request is not a
        // Twitter one, so the toggle is presumably not required here. Confirm before removing.
        engine.disable_tags(&["twitter-embeds"]);
        let checked = engine.check_network_urls(
            "https://imagesrv.adition.com/banners/1337/files/00/0e/6f/09/000000945929.jpg?PQgSgs13hf1fw.jpg",
            "https://spiegel.de",
            "image");
        assert!(checked.matched,
            "Expected match, got filter {:?}, exception {:?}",
            checked.filter, checked.exception);
        engine.enable_tags(&["twitter-embeds"]);
    }
}

#[test]
/// Checks, on the engine loaded from the published serialized data, that the Twitter widget
/// script is matched while the `twitter-embeds` tag is disabled and not matched once it is
/// enabled.
fn check_live_deserialized_specific_urls() {
    const WIDGET_URL: &str = "https://platform.twitter.com/widgets.js";
    const PAGE_URL: &str = "https://fmarier.github.io/brave-testing/social-widgets.html";
    const TAG: [&str; 1] = ["twitter-embeds"];

    let mut engine = get_blocker_engine_deserialized();

    // Tag off: the request is expected to match.
    engine.disable_tags(&TAG);
    let with_tag_off = engine.check_network_urls(WIDGET_URL, PAGE_URL, "script");
    assert_eq!(
        with_tag_off.matched, true,
        "Expected match, got filter {:?}, exception {:?}",
        with_tag_off.filter, with_tag_off.exception
    );

    // Tag on: the same request is expected not to match.
    engine.enable_tags(&TAG);
    let with_tag_on = engine.check_network_urls(WIDGET_URL, PAGE_URL, "script");
    assert_eq!(
        with_tag_on.matched, false,
        "Expected no match, got filter {:?}, exception {:?}",
        with_tag_on.filter, with_tag_on.exception
    );
}

#[test]
/// Runs every recorded regression request through an engine built from the live filter lists
/// and compares the result with the recorded expectation.
fn check_live_from_filterlists() {
    let engine = get_blocker_engine();

    for expected in load_requests() {
        let outcome = engine.check_network_urls(
            &expected.url,
            &expected.sourceUrl,
            &expected.r#type,
        );
        assert_eq!(
            outcome.matched, expected.blocked,
            "Expected match {} for {} at {}, got filter {:?}, exception {:?}",
            expected.blocked, expected.url, expected.sourceUrl, outcome.filter, outcome.exception
        );
    }
}

#[test]
/// Runs every recorded regression request through the engine loaded from the published
/// serialized data, logging each request before it is checked.
fn check_live_deserialized_file() {
    let engine = get_blocker_engine_deserialized();

    load_requests().into_iter().for_each(|expected| {
        println!("Checking {:?}", expected);
        let outcome = engine.check_network_urls(
            &expected.url,
            &expected.sourceUrl,
            &expected.r#type,
        );
        assert_eq!(
            outcome.matched, expected.blocked,
            "Expected match {} for {} {} {}",
            expected.blocked, expected.url, expected.sourceUrl, expected.r#type
        );
    });
}

#[test]
#[ignore]
/// Runs every recorded regression request through the engine built from the iOS rule list.
/// Ignored by default, so it only runs when requested explicitly.
fn check_live_deserialized_ios() {
    let engine = get_blocker_engine_deserialized_ios();

    for expected in load_requests() {
        let outcome = engine.check_network_urls(
            &expected.url,
            &expected.sourceUrl,
            &expected.r#type,
        );
        assert_eq!(
            outcome.matched, expected.blocked,
            "Expected match {} for {} {} {}",
            expected.blocked, expected.url, expected.sourceUrl, expected.r#type
        );
    }
}

#[cfg(feature = "resource_assembler")]
#[test]
/// Checks that requests matched by redirect rules come back with a redirect resource once the
/// engine has been given the test resources assembled from `data/test/fake-uBO-files`.
fn check_live_redirects() {
    use adblock::resources::resource_assembler::assemble_web_accessible_resources;

    let mut engine = get_blocker_engine();
    let redirect_engine_path = std::path::Path::new("data/test/fake-uBO-files/redirect-engine.js");
    let war_dir = std::path::Path::new("data/test/fake-uBO-files/web_accessible_resources");
    let resources = assemble_web_accessible_resources(war_dir, redirect_engine_path);

    engine.use_resources(&resources);
    {
        let checked = engine.check_network_urls(
            "https://c.amazon-adsystem.com/aax2/amzn_ads.js",
            "https://aussieexotics.com/",
            "script");
        assert!(checked.matched,
            "Expected match, got filter {:?}, exception {:?}",
            checked.filter, checked.exception);
        assert!(checked.redirect.is_some());
        // Check for the specific expected return script value in base64.
        // The literal must stay on a single line: a raw line break inside it would become part
        // of the expected string and the comparison could never succeed.
        assert_eq!(checked.redirect.unwrap(), "data:application/javascript;base64,LyoqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioqKioKCiAgICB1QmxvY2sgT3JpZ2luIC0gYSBicm93c2VyIGV4dGVuc2lvbiB0byBibG9jayByZXF1ZXN0cy4KICAgIENvcHlyaWdodCAoQykgMjAxOS1wcmVzZW50IFJheW1vbmQgSGlsbAoKICAgIFRoaXMgcHJvZ3JhbSBpcyBmcmVlIHNvZnR3YXJlOiB5b3UgY2FuIHJlZGlzdHJpYnV0ZSBpdCBhbmQvb3IgbW9kaWZ5CiAgICBpdCB1bmRlciB0aGUgdGVybXMgb2YgdGhlIEdOVSBHZW5lcmFsIFB1YmxpYyBMaWNlbnNlIGFzIHB1Ymxpc2hlZCBieQogICAgdGhlIEZyZWUgU29mdHdhcmUgRm91bmRhdGlvbiwgZWl0aGVyIHZlcnNpb24gMyBvZiB0aGUgTGljZW5zZSwgb3IKICAgIChhdCB5b3VyIG9wdGlvbikgYW55IGxhdGVyIHZlcnNpb24uCgogICAgVGhpcyBwcm9ncmFtIGlzIGRpc3RyaWJ1dGVkIGluIHRoZSBob3BlIHRoYXQgaXQgd2lsbCBiZSB1c2VmdWwsCiAgICBidXQgV0lUSE9VVCBBTlkgV0FSUkFOVFk7IHdpdGhvdXQgZXZlbiB0aGUgaW1wbGllZCB3YXJyYW50eSBvZgogICAgTUVSQ0hBTlRBQklMSVRZIG9yIEZJVE5FU1MgRk9SIEEgUEFSVElDVUxBUiBQVVJQT1NFLiAgU2VlIHRoZQogICAgR05VIEdlbmVyYWwgUHVibGljIExpY2Vuc2UgZm9yIG1vcmUgZGV0YWlscy4KCiAgICBZb3Ugc2hvdWxkIGhhdmUgcmVjZWl2ZWQgYSBjb3B5IG9mIHRoZSBHTlUgR2VuZXJhbCBQdWJsaWMgTGljZW5zZQogICAgYWxvbmcgd2l0aCB0aGlzIHByb2dyYW0uICBJZiBub3QsIHNlZSB7aHR0cDovL3d3dy5nbnUub3JnL2xpY2Vuc2VzL30uCgogICAgSG9tZTogaHR0cHM6Ly9naXRodWIuY29tL2dvcmhpbGwvdUJsb2NrCiovCgooZnVuY3Rpb24oKSB7CiAgICAndXNlIHN0cmljdCc7CiAgICBpZiAoIGFtem5hZHMgKSB7CiAgICAgICAgcmV0dXJuOwogICAgfQogICAgdmFyIHcgPSB3aW5kb3c7CiAgICB2YXIgbm9vcGZuID0gZnVuY3Rpb24oKSB7CiAgICAgICAgOwogICAgfS5iaW5kKCk7CiAgICB2YXIgYW16bmFkcyA9IHsKICAgICAgICBhcHBlbmRTY3JpcHRUYWc6IG5vb3BmbiwKICAgICAgICBhcHBlbmRUYXJnZXRpbmdUb0FkU2VydmVyVXJsOiBub29wZm4sCiAgICAgICAgYXBwZW5kVGFyZ2V0aW5nVG9RdWVyeVN0cmluZzogbm9vcGZuLAogICAgICAgIGNsZWFyVGFyZ2V0aW5nRnJvbUdQVEFzeW5jOiBub29wZm4sCiAgICAgICAgZG9BbGxUYXNrczogbm9vcGZuLAogICAgICAgIGRvR2V0QWRzQXN5bmM6IG5vb3BmbiwKICAgICAgICBkb1Rhc2s6IG5vb3BmbiwKICAgICAgICBkZXRlY3RJZnJhbWVBbmRHZXRVUkw6IG5vb3BmbiwKICAgICAgICBnZXRBZHM6IG5vb3BmbiwKICAgICAgICBnZXRBZHNBc3luYzogbm9vcGZuLAogICAgICAgIGdldEFkRm9yU2xvdDogbm9vcGZuLAogICAgICAgIGdldEFkc0NhbGxiYWNrOiBub29wZm4sCiAgICAgICAgZ2V0RGlzcGxheUFkczogbm9vcGZuLAogICAgICAgIGdldERpc3BsYXlBZHNBc3luYzogbm9vcGZuLAogICAgICAgIGdldERpc3BsYXlBZHNDYWxsYmFjazogbm9vcGZuLAogICAgICAgIGdldEtleXM6IG5vb3BmbiwKICAgICAgICBnZXRSZWZlcnJlclVSTDogbm9vcGZuLAogICAgICAgIGdldFNjcmlwdFNvdXJjZTogbm9vcGZuLAogICAgICAgIGdldFRhcmdldGluZzogbm9vcGZuLAogICAgICAgIGdldFRva2Vuczogbm9vcGZuLAogICAgICAgIGdldFZhbGlkTWlsbGlzZWNvbmRzOiBub29wZm4sCiAgICAgICAgZ2V0VmlkZW9BZHM6IG5vb3BmbiwKICAgICAgICBnZXRWaWRlb0Fkc0FzeW5jOiBub29wZm4sCiAgICAgICAgZ2V0VmlkZW9BZHNDYWxsYmFjazogbm9vcGZuLAogICAgICAgIGhhbmRsZUNhbGxCYWNrOiBub29wZm4sCiAgICAgICAgaGFzQWRzOiBub29wZm4sCiAgICAgICAgcmVuZGVyQWQ6IG5vb3BmbiwKICAgICAgICBzYXZlQWRzOiBub29wZm4sCiAgICAgICAgc2V0VGFyZ2V0aW5nOiBub29wZm4sCiAgICAgICAgc2V0VGFyZ2V0aW5nRm9yR1BUQXN5bmM6IG5vb3BmbiwKICAgICAgICBzZXRUYXJnZXRpbmdGb3JHUFRTeW5jOiBub29wZm4sCiAgICAgICAgdHJ5R2V0QWRzQXN5bmM6IG5vb3BmbiwKICAgICAgICB1cGRhdGVBZHM6IG5vb3BmbgogICAgfTsKICAgIHcuYW16bmFkcyA9IGFtem5hZHM7CiAgICB3LmFtem5fYWRzID0gdy5hbXpuX2FkcyB8fCBub29wZm47CiAgICB3LmFheF93cml0ZSA9IHcuYWF4X3dyaXRlIHx8IG5vb3BmbjsKICAgIHcuYWF4X3JlbmRlcl9hZCA9IHcuYWF4X3JlbmRlcl9hZCB8fCBub29wZm47Cn0pKCk7Cg==");
    }
    {
        let checked = engine.check_network_urls(
            "https://www.googletagservices.com/tag/js/gpt.js",
            "https://winniethepooh.disney.com/",
            "script");
        assert!(checked.matched,
            "Expected match, got filter {:?}, exception {:?}",
            checked.filter, checked.exception);
        assert!(checked.redirect.is_some());
    }
}

#[test]
/// Two engines built independently from the same filter set must produce byte-identical
/// serialized output.
fn stable_serialization() {
    // Builds a brand-new engine from the shared filter set and serializes it.
    let serialize_fresh_engine = || {
        Engine::from_filter_set(ALL_FILTERS.clone(), true)
            .serialize_raw()
            .unwrap()
    };

    let first = serialize_fresh_engine();
    let second = serialize_fresh_engine();

    assert_eq!(first, second);
}

#[test]
/// Serializing an engine, loading that buffer into a second engine and serializing again must
/// reproduce the original buffer exactly.
fn stable_serialization_through_load() {
    let original = Engine::from_filter_set(ALL_FILTERS.clone(), true)
        .serialize_raw()
        .unwrap();

    let mut reloaded = Engine::new(true);
    reloaded.deserialize(&original).unwrap();
    let round_tripped = reloaded.serialize_raw().unwrap();

    assert_eq!(original, round_tripped);
}