chromiumoxide/handler/blockers/
scripts.rs

1use crate::handler::blockers::Trie;
2
3lazy_static::lazy_static! {
4    /// Ignore list of scripts.
5    pub (crate) static ref URL_IGNORE_TRIE: Trie = {
6        let mut trie = Trie::new();
7        let patterns = [
8            "https://www.googletagservices.com/tag/",
9            "https://js.hs-analytics.net/analytics/",
10            "https://www.googletagmanager.com/gtag",
11            "https://www.googletagmanager.com/gtm.js",
12            "https://cm.g.doubleclick.net/",
13            "https://ads.pubmatic.com/AdServer/",
14            "https://js.hsadspixel.net",
15            "https://www.google.com/adsense/",
16            "https://www.googleadservices.com",
17            "https://static.cloudflareinsights.com/",
18            "https://adservice.google.com",
19            "https://www.gstatic.com/cv/js/sender/",
20            "https://googleads.g.doubleclick.net",
21            "https://www.google-analytics.com",
22            "https://www.googleanalytics.com",
23            "https://iabusprivacy.pmc.com/geo-info.js",
24            "https://cookie-cdn.cookiepro.com/consent",
25            "https://static.hotjar.com/",
26            "https://load.sumome.com/",
27            "https://www.mongoosemetrics.com/",
28            "https://geolocation-recommendations.shopifyapps.com/",
29            "https://w.usabilla.com/",
30            "https://consentcdn.cookiebot.com/",
31            "https://plausible.io/api/event",
32            "https://sentry.io/api/",
33            "https://cdn.onesignal.com",
34            "https://cdn.cookielaw.org/",
35            "https://static.doubleclick.net",
36            "https://tools.luckyorange.com/",
37            "https://cdn.piano.io",
38            "https://px.ads.linkedin.com",
39            "https://connect.facebook.net",
40            "https://tags.tiqcdn.com",
41            "https://tr.snapchat.com",
42            "https://ads.twitter.com",
43            "https://cdn.segment.com",
44            "https://stats.wp.com",
45            "https://analytics.",
46            "http://analytics.",
47            "https://cdn.cxense.com",
48            "https://cdn.tinypass.com",
49            "https://cd.connatix.com",
50            "https://js.chargebee.com/v2/chargebee.js",
51            "https://consent.cookiebot.com/",
52            "https://platform-api.sharethis.com/js/sharethis.js",
53            "https://js.hsforms.net/forms/embed/v2.js",
54            "https://static.parastorage.com/services/wix-thunderbolt/dist/",
55            "https://static.parastorage.com/services/tag-manager-client/",
56            "https://static.parastorage.com/services/form-app/",
57            "https://www.datadoghq-browser-agent.com/",
58            "https://tvem.cdn.turner.com/v2/",
59            "https://image6.pubmatic.com/AdServer/",
60            "https://i.cdn.turner.com/ads/adfuel/",
61            "https://featureassets.org",
62            "https://cdn.rudderlabs.com",
63            "https://script.hotjar.com/",
64            "https://cdn.branch.io/branch-latest.min.js",
65            "https://cdn.insurads.com/",
66            "https://cdn-ukwest.onetrust.com",
67            "https://cdn.onetrust.com",
68            "https://services.insurads.com/",
69            "https://platform.iteratehq.com/loader.js",
70            "https://connect.facebook.net/en_US/fbevents.js",
71            "https://acdn.adnxs.com/ast/ast.js",
72            "https://schibsted-cdn.relevant-digital.com/static/tags/",
73            "https://bat.bing.net",
74            "https://tpc.googlesyndication.com/",
75            "https://cdn.petametrics.com/",
76            "https://cdn.doubleverify.com/",
77            "https://www.facebook.com/v17.0/plugins/like.php?",
78            "https://gum.criteo.com",
79            "https://js-sec.indexww.com",
80            "https://eus.rubiconproject.com/",
81            "https://eb2.3lift.com/",
82            "https://acdn.adnxs.com/",
83            "https://ssc-cms.33across.com/",
84            "https://static.addtoany.com/menu/",
85            "https://www.gstatic.com/cast/sdk/libs/sender/1.0/cast_framework.js",
86            "https://www.gstatic.com/eureka/clank/131/cast_sender.js",
87            "https://static.adsafeprotected.com/",
88            "https://ssum-sec.casalemedia.com/usermatch",
89            "https://cdn.brandmetrics.com/scripts/",
90            "https://cdn.confiant-integrations.net/",
91            "https://www.b2i.us/b2i/",
92            "https://acsbapp.com/apps/app/dist/js/app.js",
93            "https://cdn.doofinder.com/livelayer/",
94            "https://load.sumo.com/",
95            "https://cdn11.bigcommerce.com/",
96            "https://na.shgcdn3.com/collector.js",
97            "https://microapps.bigcommerce.com/bodl-events/index.js",
98            "https://checkout-sdk.bigcommerce.com/v1/loader.js",
99            "https://cdn.callrail.com/companies/",
100            "https://www.webtraxs.com/trxscript.php",
101            "https://diffuser-cdn.app-us1.com/diffuser/diffuser.js",
102            "https://try.abtasty.com/",
103            "https://imasdk.googleapis.com/js/sdkloader/ima3.js",
104            "https://cdn.registerdisney.go.com/v4/responder.js",
105            "https://cdn.registerdisney.go.com/v4/OneID.js",
106            "https://js-agent.newrelic.com/",
107            "https://bat.bing.com/bat.js",
108            "https://s1.hdslb.com/bfs/cm/cm-sdk/static/js/track-collect.js",
109            "https://consent.trustarc.com/",
110            "https://cdn-sitegainer.com/",
111            "https://static.cloudflareinsights.com/beacon.min.js/",
112            "https://hm.baidu.com/hm.js",
113            "https://unpkg.zhimg.com/@efe/zhad-tracker",
114            "https://pagead2.googlesyndication.com/tag/js/gpt.js",
115            "https://mab.chartbeat.com/mab_strategy/",
116            "https://c.amazon-adsystem.com/",
117            "https://rumcdn.geoedge.be/",
118            "https://assets.adobedtm.com/extensions/",
119            "https://macro.adnami.io/macro/spec/adsm.macro.",
120            "https://log.medietall.no/analytics.js",
121            "https://cdn.siftscience.com/s.js",
122            "https://lwadm.com/lw/pbjs?",
123            "https://cl.k5a.io/",
124            "https://cdn-cookieyes.com/",
125            "https://pbs.yahoo.com/",
126            "https://ads.pubmatic.com/AdServer/js/",
127            "https://widgets.outbrain.com/nanoWidget/externals/obPixelFrame/obPixelFrame.js",
128            "https://widgets.outbrain.com/external/externals/intentiq.js",
129            "https://cdn.fuseplatform.net/publift/tags/",
130            "//d2wy8f7a9ursnm.cloudfront.net/v8/bugsnag.min.js",
131            ".sharethis.com",
132            ".newrelic.com",
133            ".googlesyndication.com",
134            ".amazon-adsystem.com",
135            ".onetrust.com",
136            "sc.omtrdc.net",
137            "doubleclick.net",
138            "hotjar.com",
139            "datadome.com",
140            "datadog-logs-us.js",
141            "tinypass.min.js",
142            ".airship.com",
143            ".adlightning.com",
144            ".lab.amplitude.",
145            // explicit ignore tracking.js and ad files
146            "privacy-notice.js",
147            "tracking.js",
148            "plugins/cookie-law-info/legacy/",
149            "ads.js",
150            "insight.min.js",
151            "assets/TrackingPixel",
152            "https://ads.",
153            "http://ads.",
154            ".pubmatic.com/AdServer/",
155            "https://tracking.",
156            "http://tracking.",
157            "https://static-tracking.",
158            // exp testin
159            // used for possible location outside
160            "https://geo.privacymanager.io/",
161            // "https://www.recaptcha.net/recaptcha/",
162            // "https://www.google.com/recaptcha/",
163            // "https://www.gstatic.com/recaptcha/",
164        ];
165        for pattern in &patterns {
166            trie.insert(pattern);
167        }
168        trie
169    };
170
171    /// Ignore list of scripts embedded or font extra.
172    pub(crate) static ref URL_IGNORE_EMBEDED_TRIE: Trie = {
173        let mut trie = Trie::new();
174        let patterns = [
175            "https://www.youtube.com/embed/",      // YouTube video embeds
176            "https://www.google.com/maps/embed?",  // Google Maps embeds
177            "https://maps.google.com", // Google maps iframe.
178            "https://player.vimeo.com/video/",     // Vimeo video embeds
179            "https://player.vimeo.com/api/player.js", // Vimeo video embeds
180            "https://open.spotify.com/embed/",     // Spotify music embeds
181            "https://w.soundcloud.com/player/",    // SoundCloud embeds
182            "https://platform.twitter.com/embed/", // Twitter embedded tweets
183            "https://www.instagram.com/embed.js",  // Instagram embeds
184            "https://www.facebook.com/plugins/",   // Facebook embeds (like posts and videos)
185            "https://cdn.embedly.com/widgets/",    // Embedly embeds
186            "https://player.twitch.tv/",           // Twitch video player embeds
187            "https://maps.googleapis.com/maps/", // Google map embeds
188            "https://www.youtube.com/player_api", // Youtube player.
189            "https://www.googletagmanager.com/ns.html", // Google tag manager.
190            "https://consentcdn.cookiebot.com", // Cookie bot
191            "https://www.youtube.com/iframe_api", // Youtube iframes.
192            "https://f.vimeocdn.com", // Vimeo EMBEDDINGS
193            "https://i.vimeocdn.com/",
194            "https://image2.pubmatic.com/AdServer/",
195            "https://ads.pubmatic.com/AdServer/js/",
196            "https://cdn.taboola.com/libtrc/static/topics/",
197            "https://pm-widget.taboola.com/",
198            "https://gum.criteo.com/syncframe",
199            // "https://www.youtube.com/s/player/", // Youtube player not needed usually since iframe_api is used mainly
200            // vercel live
201            "https://vercel.live/api/",
202
203            // extra CDN scripts
204            "https://cdn.readme.io/public/",
205            // font awesome
206            "https://use.fontawesome.com/",
207            // insight tracker
208            "https://insight.adsrvr.org/track/",
209            "http://www.google-analytics.com/ga.js",
210            "cxense.com/",
211            // snapchat tracker
212            "https://tr.snapchat.com/",
213            "https://buy.tinypass.com",
214            "https://nimbleplot.com/",
215            "https://my.actiondata.co/js/tracker.php",
216            "https://ajax.googleapis.com/ajax/libs/webfont/",
217            "http://cdn2.editmysite.com/",
218            // ignore font extras
219            "https://kit.fontawesome.com/",
220            "https://use.typekit.net",
221            ".amplitude.com",
222            ".rudderstack.com",
223            // ignore tailwind cdn
224            "https://cdn.tailwindcss.com",
225            // ignore extra ads
226            ".sharethis.com",
227            "amazon-adsystem.com",
228            ".vimeocdn.com",
229            "g.doubleclick.net",
230            "https://securepubads.g.doubleclick.net",
231            "googlesyndication.com",
232            "adsafeprotected.com",
233            // more google tracking
234            ".googlesyndication.com/safeframe/",
235            // repeat consent js
236            "/ccpa/user-consent.min.js",
237            "consent-manager",
238            "/cookiebanner/js/",
239            "cookielaw.org",
240            "bugsnag.min.js",
241            // privacy
242            "otBannerSdk.js",
243            "privacy-notice.js",
244            ".ingest.sentry.io/api",
245            // ignore amazon scripts for media
246            ".ssl-images-amazon.com/images/"
247        ];
248        for pattern in &patterns {
249            trie.insert(pattern);
250        }
251        trie
252    };
253
254    /// Ignore list of path scripts to ignore for tracking and analytics.
255    pub(crate) static ref URL_IGNORE_SCRIPT_BASE_PATHS: Trie = {
256        let mut trie = Trie::new();
257        let patterns = [
258            "wp-content/plugins/cookie-law-info",
259            "wp-content/js/rlt-proxy.js",
260            "wp-admin/rest-proxy/",
261            "wp-content/mu-plugins/a8c-analytics/",
262            "analytics/",
263            "cookie-tracking",
264        ];
265        for pattern in &patterns {
266            trie.insert(pattern);
267        }
268        trie
269    };
270
271    /// Ignore list of path scripts to ignore for themes.
272    pub (crate) static ref URL_IGNORE_SCRIPT_STYLES_PATHS: Trie = {
273        let mut trie = Trie::new();
274        let patterns = [
275            "wp-content/themes/",
276            "wp-content/plugins/dizo-image-hover/",
277            "wp-content/plugins/supreme-modules-pro-for-divi/",
278            "wp-content/plugins/page-builder-pmc/",
279            "wp-content/plugins/contact-form-7/",
280            "wp-content/plugins/responsive-lightbox/",
281            "wp-content/cache/breeze-minification/",
282            "wp-includes/js/mediaelement",
283            "wp-content/plugins/gravityforms/",
284            "wp-content/plugins/wp-rocket/assets/js/lazyload/",
285            "wp-content/plugins/w3-total-cache/",
286            "wp-content/js/bilmur.min.js",
287            "npm/bootstrap@"
288        ];
289        for pattern in &patterns {
290            trie.insert(pattern);
291        }
292        trie
293    };
294
295    /// Ignore list of scripts paths.
296    pub (crate) static ref URL_IGNORE_TRIE_PATHS: Trie = {
297        let mut trie = Trie::new();
298        let patterns = [
299            // explicit ignore tracking.js and ad files
300            "privacy-notice.js",
301            "tracking.js",
302            "track.js",
303            "ads.js",
304            "analytics.js",
305            "otSDKStub.js",
306            "otBannerSdk.js",
307            "_vercel/insights/script.js",
308            "analytics.",
309        ];
310        for pattern in &patterns {
311            trie.insert(pattern);
312        }
313        trie
314    };
315
316}
317
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `trie.contains_prefix` is true for every entry of
    /// `expected_hits` and false for every entry of `expected_misses`.
    ///
    /// All hits are checked before any miss.
    fn check_trie(trie: &Trie, expected_hits: &[&str], expected_misses: &[&str]) {
        for &case in expected_hits {
            assert!(trie.contains_prefix(case), "Trie should contain: {}", case);
        }

        for &case in expected_misses {
            assert!(
                !trie.contains_prefix(case),
                "Trie should not contain: {}",
                case
            );
        }
    }

    #[test]
    fn test_url_ignore_trie_contains() {
        check_trie(
            &URL_IGNORE_TRIE,
            // URLs that should be contained in the trie.
            &[
                "https://www.googletagservices.com/tag/",
                "https://www.google-analytics.com",
                "https://www.googleanalytics.com",
                ".newrelic.com",
                "privacy-notice.js",
            ],
            // URLs that should not be contained in the trie.
            &[
                "https://not-a-tracked-url.com/script.js",
                "https://google.com",
            ],
        );
    }

    #[test]
    fn test_url_ignore_embedded_trie_contains() {
        check_trie(
            &URL_IGNORE_EMBEDED_TRIE,
            // URLs that should be contained in the trie.
            &[
                "https://www.youtube.com/embed/",
                "https://www.google.com/maps/embed?",
                ".amplitude.com",
            ],
            // URLs that should not be contained in the trie.
            &[
                "https://secure-site.com/resource.js",
                "https://example.com/embed.js",
            ],
        );
    }

    #[test]
    fn test_url_ignore_script_base_paths_contains() {
        check_trie(
            &URL_IGNORE_SCRIPT_BASE_PATHS,
            // Paths that should be contained in the trie.
            &["wp-content/plugins/cookie-law-info", "analytics/"],
            // Paths that should not be contained in the trie.
            &[
                "wp-content/some-untracked-plugin/",
                "random/path/analytics.js",
            ],
        );
    }

    #[test]
    fn test_url_ignore_script_style_paths_contains() {
        check_trie(
            &URL_IGNORE_SCRIPT_STYLES_PATHS,
            // Paths that should be contained in the trie.
            &["wp-content/themes/", "npm/bootstrap@"],
            // Paths that should not be contained in the trie.
            &[
                "wp-content/some-other-theme/",
                "wp-content/plugins/untracked-plugin/",
            ],
        );
    }

    #[test]
    fn test_url_ignore_trie_paths_contains() {
        check_trie(
            &URL_IGNORE_TRIE_PATHS,
            // Paths that should be contained in the trie.
            &["privacy-notice.js", "tracking.js"],
            // Paths that should not be contained in the trie.
            &["non-ignored.js", "non-related/tracking.js"],
        );
    }
}