adblock/
lists.rs

//! Parsing functions and collections for handling multiple filter rules.
2
3use std::convert::TryFrom;
4
5use crate::filters::cosmetic::{CosmeticFilter, CosmeticFilterError};
6use crate::filters::network::{NetworkFilter, NetworkFilterError};
7use crate::resources::PermissionMask;
8
9use itertools::{Either, Itertools};
10use memchr::memchr as find_char;
11use serde::{Deserialize, Serialize};
12use thiserror::Error;
13
14/// Specifies rule types to keep during parsing.
15#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
16pub enum RuleTypes {
17    #[default]
18    All,
19    NetworkOnly,
20    CosmeticOnly,
21}
22
23impl RuleTypes {
24    pub fn loads_network_rules(&self) -> bool {
25        matches!(self, Self::All | Self::NetworkOnly)
26    }
27
28    pub fn loads_cosmetic_rules(&self) -> bool {
29        matches!(self, Self::All | Self::CosmeticOnly)
30    }
31}
32
33/// Options for tweaking how a filter or list of filters is interpreted when parsing. It's
34/// recommended to use _struct update syntax_ with a `default()` "rest" value; adding new fields to
35/// this struct will not be considered a breaking change.
36///
37/// ```
38/// # use adblock::lists::{FilterFormat, ParseOptions};
39/// let parse_options = ParseOptions {
40///     format: FilterFormat::Hosts,
41///     ..ParseOptions::default()
42/// };
43/// ```
44#[derive(Copy, Clone, Deserialize)]
45pub struct ParseOptions {
46    /// Assume filters are in the given format when parsing. Defaults to `FilterFormat::Standard`.
47    #[serde(default)]
48    pub format: FilterFormat,
49    /// Specifies rule types to keep during parsing. Defaults to `RuleTypes::All`. This can be used
50    /// to reduce the memory impact of engines that will only be used for cosmetic filtering or
51    /// network filtering, but not both. It can also be useful for iOS and macOS when exporting to
52    /// content-blocking syntax, as these platforms limit the number of content blocking rules that
53    /// can be loaded.
54    #[serde(default)]
55    pub rule_types: RuleTypes,
56    /// Specifies permissions to use when parsing a given filter list. See [`PermissionMask`] for
57    /// more info.
58    #[serde(default)]
59    pub permissions: PermissionMask,
60}
61
62impl Default for ParseOptions {
63    fn default() -> Self {
64        ParseOptions {
65            format: FilterFormat::Standard,
66            rule_types: RuleTypes::All,
67            permissions: PermissionMask::default(),
68        }
69    }
70}
71
/// Manages a set of rules to be added to an [`crate::Engine`].
///
/// To be able to efficiently handle special options like `$badfilter`, and to allow optimizations,
/// all rules must be available when the `Engine` is first created. `FilterSet` allows assembling a
/// compound list from multiple different sources before compiling the rules into an `Engine`.
#[derive(Clone)]
pub struct FilterSet {
    /// Whether information about the original raw filter rules is saved alongside the parsed
    /// representation; forwarded to every parse call made through this set.
    debug: bool,
    /// Network filters accumulated so far, in insertion order.
    pub(crate) network_filters: Vec<NetworkFilter>,
    /// Cosmetic filters accumulated so far, in insertion order.
    pub(crate) cosmetic_filters: Vec<CosmeticFilter>,
}
83
84/// Collects metadata for the list by reading just until the first non-comment line.
85pub fn read_list_metadata(list: &str) -> FilterListMetadata {
86    let mut metadata = FilterListMetadata::default();
87
88    // uBO only searches within the first 1024 characters; the same optimization can be useful here
89    let mut cutoff = list.len().min(1024);
90
91    while !list.is_char_boundary(cutoff) {
92        cutoff -= 1;
93    }
94
95    // String slice is safe here because `cutoff` is guaranteed to be a character boundary
96    for line in list[0..cutoff].lines() {
97        if line.starts_with('!') {
98            metadata.try_add(line);
99        } else if line.starts_with('[') {
100            continue;
101        } else {
102            break;
103        }
104    }
105
106    metadata
107}
108
109impl Default for FilterSet {
110    /// Equivalent to `FilterSet::new(false)`, or `FilterSet::new(true)` when compiled in test
111    /// configuration.
112    fn default() -> Self {
113        #[cfg(not(test))]
114        let debug = false;
115
116        #[cfg(test)]
117        let debug = true;
118
119        Self::new(debug)
120    }
121}
122
123/// Corresponds to the `expires` field of `FilterListMetadata`.
124#[derive(Debug, PartialEq, Serialize)]
125pub enum ExpiresInterval {
126    Hours(u16),
127    Days(u8),
128}
129
130impl TryFrom<&str> for ExpiresInterval {
131    type Error = ();
132
133    fn try_from(v: &str) -> Result<Self, ()> {
134        const DAYS_MAX: u8 = 14;
135        const HOURS_MAX: u16 = DAYS_MAX as u16 * 24;
136
137        // Extract time amount and unit from str
138        let mut v_split = v.split(' ');
139        let amount = v_split.next().ok_or(())?;
140        let unit = v_split.next().ok_or(())?;
141        // str::parse::<u16> accepts a leading plus sign, but we explicitly forbid it here
142        if amount.starts_with('+') {
143            return Err(());
144        }
145        // Only accept values in the range [1, MAX] for values with a matching unit
146        match unit {
147            "hour" | "hours" => {
148                let amount = amount.parse::<u16>().map_err(|_| ())?;
149                if (1..=HOURS_MAX).contains(&amount) {
150                    return Ok(Self::Hours(amount));
151                }
152            }
153            "day" | "days" => {
154                let amount = amount.parse::<u8>().map_err(|_| ())?;
155                if (1..=DAYS_MAX).contains(&amount) {
156                    return Ok(Self::Days(amount));
157                }
158            }
159            _ => (),
160        }
161        Err(())
162    }
163}
164
165/// Includes information about any "special comments" as described by
166/// <https://help.eyeo.com/adblockplus/how-to-write-filters#special-comments>
167#[derive(Default, Serialize)]
168pub struct FilterListMetadata {
169    /// `! Homepage: http://example.com` - This comment determines which webpage should be linked
170    /// as filter list homepage.
171    pub homepage: Option<String>,
172    /// `! Title: FooList` - This comment sets a fixed title for the filter list. If this comment
173    /// is present, the user is no longer able to change the title.
174    pub title: Option<String>,
175    /// `! Expires: 5 days` - This comment sets the update interval for the filter list. The value
176    /// can be given in days (e.g. 5 days) or hours (e.g. 8 hours). Any value between 1 hour and 14
177    /// days is possible. Note that the update will not necessarily happen after this time
178    /// interval. The actual update time is slightly randomized and depends on some additional
179    /// factors to reduce server load.
180    pub expires: Option<ExpiresInterval>,
181    /// `! Redirect: http://example.com/list.txt` - This comment indicates that the filter list has
182    /// moved to a new download address. Adblock Plus ignores any file content beyond that comment
183    /// and immediately tries downloading from the new address. In case of success, the address of
184    /// the filter list is updated in the settings. This comment is ignored if the new address is
185    /// the same as the current address, meaning that it can be used to enforce the "canonical"
186    /// address of the filter list.
187    pub redirect: Option<String>,
188}
189
190impl FilterListMetadata {
191    /// Attempts to add a line of a filter list to this collection of metadata. Only comment lines
192    /// with valid metadata content will be added. Previously added information will not be
193    /// rewritten.
194    fn try_add(&mut self, line: &str) {
195        if let Some(kv) = line.strip_prefix("! ") {
196            if let Some((key, value)) = kv.split_once(": ") {
197                match key {
198                    "Homepage" if self.homepage.is_none() => {
199                        self.homepage = Some(value.to_string())
200                    }
201                    "Title" if self.title.is_none() => self.title = Some(value.to_string()),
202                    "Expires" if self.expires.is_none() => {
203                        if let Ok(expires) = ExpiresInterval::try_from(value) {
204                            self.expires = Some(expires);
205                        }
206                    }
207                    "Redirect" if self.redirect.is_none() => {
208                        self.redirect = Some(value.to_string())
209                    }
210                    _ => (),
211                }
212            }
213        }
214    }
215}
216
217impl FilterSet {
218    /// Creates a new `FilterSet`. `debug` specifies whether or not to save information about the
219    /// original raw filter rules alongside the more compact internal representation. If enabled,
220    /// this information will be passed to the corresponding `Engine`.
221    pub fn new(debug: bool) -> Self {
222        Self {
223            debug,
224            network_filters: Vec::new(),
225            cosmetic_filters: Vec::new(),
226        }
227    }
228
229    // Used in benchmarks to avoid parsing the rules twice.
230    #[doc(hidden)]
231    pub fn new_with_rules(
232        network_filters: Vec<NetworkFilter>,
233        cosmetic_filters: Vec<CosmeticFilter>,
234        debug: bool,
235    ) -> Self {
236        Self {
237            debug,
238            network_filters,
239            cosmetic_filters,
240        }
241    }
242
243    /// Adds the contents of an entire filter list to this `FilterSet`. Filters that cannot be
244    /// parsed successfully are ignored. Returns any discovered metadata about the list of rules
245    /// added.
246    pub fn add_filter_list(&mut self, filter_list: &str, opts: ParseOptions) -> FilterListMetadata {
247        self.add_filters(filter_list.lines(), opts)
248    }
249
250    /// Adds a collection of filter rules to this `FilterSet`. Filters that cannot be parsed
251    /// successfully are ignored. Returns any discovered metadata about the list of rules added.
252    pub fn add_filters(
253        &mut self,
254        filters: impl IntoIterator<Item = impl AsRef<str>>,
255        opts: ParseOptions,
256    ) -> FilterListMetadata {
257        let (metadata, parsed_network_filters, parsed_cosmetic_filters) =
258            parse_filters_with_metadata(filters, self.debug, opts);
259        self.network_filters.extend(parsed_network_filters);
260        self.cosmetic_filters.extend(parsed_cosmetic_filters);
261        metadata
262    }
263
264    /// Adds the string representation of a single filter rule to this `FilterSet`.
265    pub fn add_filter(&mut self, filter: &str, opts: ParseOptions) -> Result<(), FilterParseError> {
266        let filter_parsed = parse_filter(filter, self.debug, opts);
267        match filter_parsed? {
268            ParsedFilter::Network(filter) => self.network_filters.push(filter),
269            ParsedFilter::Cosmetic(filter) => self.cosmetic_filters.push(filter),
270        }
271        Ok(())
272    }
273
274    /// Consumes this `FilterSet`, returning an equivalent list of content blocking rules and a
275    /// corresponding new list containing the `String` representation of all filters that were
276    /// successfully converted (as `FilterFormat::Standard` rules).
277    ///
278    /// The list of content blocking rules will be properly ordered to ensure correct behavior of
279    /// `ignore-previous-rules`-typed rules.
280    ///
281    /// This function will fail if the `FilterSet` was not created in debug mode.
282    #[cfg(feature = "content-blocking")]
283    #[allow(clippy::result_unit_err)]
284    pub fn into_content_blocking(
285        self,
286    ) -> Result<(Vec<crate::content_blocking::CbRule>, Vec<String>), ()> {
287        use crate::content_blocking;
288        use crate::filters::network::NetworkFilterMaskHelper;
289        use std::collections::HashSet;
290
291        if !self.debug {
292            return Err(());
293        }
294
295        // Store bad filter id to skip them later.
296        let mut bad_filter_ids = HashSet::new();
297        for filter in self.network_filters.iter() {
298            if filter.is_badfilter() {
299                bad_filter_ids.insert(filter.get_id_without_badfilter());
300            }
301        }
302
303        let mut ignore_previous_rules = vec![];
304        let mut other_rules = vec![];
305
306        let mut filters_used = vec![];
307
308        self.network_filters.into_iter().for_each(|filter| {
309            // Don't process bad filter rules or matching bad filter rules.
310            if bad_filter_ids.contains(&filter.get_id()) || filter.is_badfilter() {
311                return;
312            }
313            let original_rule = *filter
314                .raw_line
315                .clone()
316                .expect("All rules should be in debug mode");
317            if let Ok(equivalent) = TryInto::<content_blocking::CbRuleEquivalent>::try_into(filter)
318            {
319                filters_used.push(original_rule);
320                equivalent
321                    .into_iter()
322                    .for_each(|cb_rule| match &cb_rule.action.typ {
323                        content_blocking::CbType::IgnorePreviousRules => {
324                            ignore_previous_rules.push(cb_rule)
325                        }
326                        _ => other_rules.push(cb_rule),
327                    });
328            }
329        });
330
331        let add_fp_document_exception = !filters_used.is_empty();
332
333        self.cosmetic_filters.into_iter().for_each(|filter| {
334            let original_rule = *filter
335                .raw_line
336                .clone()
337                .expect("All rules should be in debug mode");
338            if let Ok(cb_rule) = TryInto::<content_blocking::CbRule>::try_into(filter) {
339                filters_used.push(original_rule);
340                match &cb_rule.action.typ {
341                    content_blocking::CbType::IgnorePreviousRules => {
342                        ignore_previous_rules.push(cb_rule)
343                    }
344                    _ => other_rules.push(cb_rule),
345                }
346            }
347        });
348
349        other_rules.extend(ignore_previous_rules);
350
351        if add_fp_document_exception {
352            other_rules.push(content_blocking::ignore_previous_fp_documents());
353        }
354
355        Ok((other_rules, filters_used))
356    }
357}
358
359/// Denotes the format of a particular list resource, which affects how its rules should be parsed.
360#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
361pub enum FilterFormat {
362    /// Rules should be parsed in ABP/uBO-style format.
363    Standard,
364    /// Each line consists of an IP address (usually 127.0.0.1 or 0.0.0.0), some whitespace, and a
365    /// single hostname. This syntax is normally used directly for HOSTS-based adblockers. These
366    /// rules will be treated equivalently to `"||hostname^"` rules in `Standard` format; the IP
367    /// addresses will not be used.
368    ///
369    /// Note that some sources provide a more raw format, where each line consists of just a
370    /// hostname. This option will also accept that format.
371    ///
372    /// For this option, `!` is accepted as a comment character at the beginning of a line, and `#`
373    /// is accepted as a comment character anywhere in a line.
374    Hosts,
375}
376
377/// Default to parsing lists in `Standard` format.
378impl Default for FilterFormat {
379    fn default() -> Self {
380        Self::Standard
381    }
382}
383
/// Describes the type of a single filter.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FilterType {
    /// A network filter, used for changing the behavior of network requests
    Network,
    /// A cosmetic filter, used for changing the behavior of fetched pages
    Cosmetic,
    /// Something else that isn't supported
    NotSupported,
}
394
395/// Successful result of parsing a single filter rule
396pub enum ParsedFilter {
397    Network(NetworkFilter),
398    Cosmetic(CosmeticFilter),
399}
400
401impl From<NetworkFilter> for ParsedFilter {
402    fn from(v: NetworkFilter) -> Self {
403        ParsedFilter::Network(v)
404    }
405}
406
407impl From<CosmeticFilter> for ParsedFilter {
408    fn from(v: CosmeticFilter) -> Self {
409        ParsedFilter::Cosmetic(v)
410    }
411}
412
413/// Unsuccessful result of parsing a single filter rule.
414#[derive(Debug, Error)]
415pub enum FilterParseError {
416    #[error("network filter error: {0}")]
417    Network(#[source] NetworkFilterError),
418    #[error("cosmetic filter error: {0}")]
419    Cosmetic(#[source] CosmeticFilterError),
420    #[error("unsupported")]
421    Unsupported,
422    #[error("empty")]
423    Empty,
424}
425
426impl From<NetworkFilterError> for FilterParseError {
427    fn from(v: NetworkFilterError) -> Self {
428        FilterParseError::Network(v)
429    }
430}
431
432impl From<CosmeticFilterError> for FilterParseError {
433    fn from(v: CosmeticFilterError) -> Self {
434        FilterParseError::Cosmetic(v)
435    }
436}
437
438/// Parse a single filter rule
439pub fn parse_filter(
440    line: &str,
441    debug: bool,
442    opts: ParseOptions,
443) -> Result<ParsedFilter, FilterParseError> {
444    let filter = line.trim();
445
446    if filter.is_empty() {
447        return Err(FilterParseError::Empty);
448    }
449
450    match opts.format {
451        FilterFormat::Standard => match (detect_filter_type(filter), opts.rule_types) {
452            (FilterType::Network, RuleTypes::All | RuleTypes::NetworkOnly) => {
453                NetworkFilter::parse(filter, debug, opts)
454                    .map(|f| f.into())
455                    .map_err(|e| e.into())
456            }
457            (FilterType::Cosmetic, RuleTypes::All | RuleTypes::CosmeticOnly) => {
458                CosmeticFilter::parse(filter, debug, opts.permissions)
459                    .map(|f| f.into())
460                    .map_err(|e| e.into())
461            }
462            _ => Err(FilterParseError::Unsupported),
463        },
464        FilterFormat::Hosts => {
465            // Hosts-style rules can only ever be network rules
466            if !opts.rule_types.loads_network_rules() {
467                return Err(FilterParseError::Unsupported);
468            }
469            if filter.starts_with('!') {
470                return Err(FilterParseError::Unsupported);
471            }
472            // Discard contents after first `#` character
473            let filter = if let Some(hash_loc) = find_char(b'#', filter.as_bytes()) {
474                let filter = &filter[..hash_loc];
475                let filter = filter.trim();
476
477                if filter.is_empty() {
478                    return Err(FilterParseError::Unsupported);
479                }
480
481                filter
482            } else {
483                filter
484            };
485
486            // Take the last of at most 2 whitespace separated fields
487            let mut filter_parts = filter.split_whitespace();
488            let hostname = match (
489                filter_parts.next(),
490                filter_parts.next(),
491                filter_parts.next(),
492            ) {
493                (None, None, None) => return Err(FilterParseError::Unsupported),
494                (Some(hostname), None, None) => hostname,
495                (Some(_ip), Some(hostname), None) => hostname,
496                (Some(_), Some(_), Some(_)) => return Err(FilterParseError::Unsupported),
497                _ => unreachable!(),
498            };
499
500            // Matches in hosts lists are usually redirected to localhost. For that reason, some
501            // lists include an entry for "localhost", which should be explicitly ignored when
502            // performing request-level adblocking.
503            if hostname == "localhost" {
504                return Err(FilterParseError::Unsupported);
505            }
506
507            NetworkFilter::parse_hosts_style(hostname, debug)
508                .map(|f| f.into())
509                .map_err(|e| e.into())
510        }
511    }
512}
513
514/// Parse an entire list of filters, ignoring any errors
515pub fn parse_filters(
516    list: impl IntoIterator<Item = impl AsRef<str>>,
517    debug: bool,
518    opts: ParseOptions,
519) -> (Vec<NetworkFilter>, Vec<CosmeticFilter>) {
520    let (_metadata, network_filters, cosmetic_filters) =
521        parse_filters_with_metadata(list, debug, opts);
522
523    (network_filters, cosmetic_filters)
524}
525
526/// Parse an entire list of filters, ignoring any errors
527pub fn parse_filters_with_metadata(
528    list: impl IntoIterator<Item = impl AsRef<str>>,
529    debug: bool,
530    opts: ParseOptions,
531) -> (FilterListMetadata, Vec<NetworkFilter>, Vec<CosmeticFilter>) {
532    let mut metadata = FilterListMetadata::default();
533
534    let list_iter = list.into_iter();
535
536    let (network_filters, cosmetic_filters): (Vec<_>, Vec<_>) = list_iter
537        .map(|line| {
538            metadata.try_add(line.as_ref());
539            parse_filter(line.as_ref(), debug, opts)
540        })
541        .filter_map(Result::ok)
542        .partition_map(|filter| match filter {
543            ParsedFilter::Network(f) => Either::Left(f),
544            ParsedFilter::Cosmetic(f) => Either::Right(f),
545        });
546
547    (metadata, network_filters, cosmetic_filters)
548}
549
550/// Given a single line, checks if this would likely be a cosmetic filter, a
551/// network filter or something that is not supported. This check is performed
552/// before calling a more specific parser to create an instance of
553/// `NetworkFilter` or `CosmeticFilter`.
554fn detect_filter_type(filter: &str) -> FilterType {
555    // Ignore comments
556    if filter.len() == 1
557        || filter.starts_with('!')
558        || (filter.starts_with('#') && filter[1..].starts_with(char::is_whitespace))
559        || filter.starts_with("[Adblock")
560    {
561        return FilterType::NotSupported;
562    }
563
564    if filter.starts_with('|') || filter.starts_with("@@|") {
565        return FilterType::Network;
566    }
567
568    // Check if filter is cosmetic
569    if let Some(sharp_index) = find_char(b'#', filter.as_bytes()) {
570        let after_sharp_index = sharp_index + 1;
571
572        // Check the next few bytes for a second `#`
573        // Indexing is safe here because it uses the filter's byte
574        // representation and guards against short strings
575        if find_char(
576            b'#',
577            &filter.as_bytes()[after_sharp_index..(after_sharp_index + 4).min(filter.len())],
578        )
579        .is_some()
580        {
581            return FilterType::Cosmetic;
582        }
583    }
584
585    // Ignore Adguard cosmetics
586    if filter.contains("$$") {
587        return FilterType::NotSupported;
588    }
589
590    // Everything else is a network filter
591    FilterType::Network
592}
593
594#[cfg(test)]
595#[path = "../tests/unit/lists.rs"]
596mod unit_tests;