Skip to main content

html_filter/filter/
mod.rs

1//! Module to filter an HTML tree to keep or remove specific nodes, with a set
2//! of rules.
3//!
4//! You can either filter your HTML with [`Html::filter`] or find a specific
5//! node with [`Html::find`].
6//!
7//! For more information on how to define the filtering rules, please refer to
8//! [`Filter`].
9
10extern crate alloc;
11mod api;
12mod element;
13mod node_type;
14pub mod types;
15
16use alloc::borrow::Cow;
17use core::cmp::Ordering;
18use core::mem::take;
19
20use node_type::NodeTypeFilter;
21use types::Filter;
22
23use crate::errors::{safe_expect, safe_unreachable};
24use crate::{Html, Tag};
25
/// Outcome of the search for a wanted node, together with the depth
/// bookkeeping used to decide how many ancestors are kept around it.
///
/// # Note
///
/// The derived [`Ord`] compares the explicit discriminants first, and an
/// explicit discriminant on a variant that carries data requires a primitive
/// representation, hence `#[repr(u8)]`. The resulting order is
/// `Success < Found(_) < None`; two `Found` values compare by their depth.
#[repr(u8)]
#[derive(Default, Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
enum DepthSuccess {
    /// Wanted node with already the wanted depth
    Success = 0,
    /// Wanted node wanting more depth
    Found(usize) = 1,
    /// Not wanted node, doesn't respect the filters
    #[default]
    None = 2,
}
43
44impl DepthSuccess {
45    /// Increment the depth, if applicable
46    fn incr(mut self) -> Self {
47        if let Self::Found(depth) = &mut self {
48            *depth = safe_expect!(depth.checked_add(1), "Smaller than required depth");
49        }
50
51        self
52    }
53}
54
/// Status of the filtering on recursion calls
///
/// Pairs the filtered tree built so far with the state of the search for a
/// wanted node, so that callers up the recursion know whether they must still
/// wrap the result in their own tag.
#[derive(Default, Debug)]
struct FilterSuccess {
    /// Indicates if the filter found a wanted node
    ///
    /// Is
    /// - [`DepthSuccess::None`] if no wanted node was found
    /// - [`DepthSuccess::Found`] if a wanted node was found and more
    ///   ancestors are still wanted around it; the stored value is the
    ///   current depth. If there are embedded nodes that satisfy the filter,
    ///   the depth is the smallest possible.
    /// - [`DepthSuccess::Success`] if a wanted node was found and the wanted
    ///   depth is already reached.
    depth: DepthSuccess,
    /// Result of the filtering
    html: Html,
}
69
70impl FilterSuccess {
71    /// Increment the depth, if applicable
72    #[expect(clippy::unnecessary_wraps, reason = "useful for filter method")]
73    fn incr(mut self) -> Option<Self> {
74        self.depth = self.depth.incr();
75        Some(self)
76    }
77
78    /// Creates a [`FilterSuccess`] from an [`Html`]
79    ///
80    /// This is the method to use when the node is considered `found`, i.e.,
81    /// when it was the node the user was looking for.
82    #[expect(clippy::unnecessary_wraps, reason = "useful for filter method")]
83    const fn make_found(html: Html) -> Option<Self> {
84        Some(Self { depth: DepthSuccess::Found(0), html })
85    }
86
87    /// Creates a [`FilterSuccess`] from an [`Html`]
88    ///
89    /// This is the method to use when the node isn't interesting alone, it can
90    /// be if it is in the right scope though.
91    #[expect(clippy::unnecessary_wraps, reason = "useful for filter method")]
92    fn make_none(html: Cow<'_, Html>) -> Option<Self> {
93        Some(Self { depth: DepthSuccess::None, html: html.into_owned() })
94    }
95}
96
97impl Html {
98    /// Method to check if a wanted node is visible
99    ///
100    /// This methods stop checking after a maximum depth, as the current node
101    /// will be discarded if it is deeper in the tree.
102    fn check_depth(&self, max_depth: usize, filter: &Filter) -> Option<usize> {
103        match self {
104            Self::Empty | Self::Text(_) | Self::Comment { .. } | Self::Doctype { .. } => None,
105            Self::Tag { tag, .. } if filter.tag_explicitly_allowed(tag) => Some(0),
106            Self::Tag { .. } | Self::Vec(_) if max_depth == 0 => None,
107            Self::Tag { child, .. } => child
108                .check_depth(
109                    #[expect(clippy::arithmetic_side_effects, reason = "non-0")]
110                    {
111                        max_depth - 1
112                    },
113                    filter,
114                )
115                .map(
116                    #[expect(clippy::arithmetic_side_effects, reason = "< initial max_depth")]
117                    |depth| depth + 1,
118                ),
119            Self::Vec(vec) => vec
120                .iter()
121                .try_fold(Some(usize::MAX), |acc, child| {
122                    if acc == Some(0) { Err(()) } else { Ok(child.check_depth(max_depth, filter)) }
123                })
124                .unwrap_or(Some(0)),
125        }
126    }
127
128    /// Filters html based on a defined filter.
129    ///
130    /// See [`Filter`] to learn how to create filters.
131    ///
132    /// Filters allow you to select the portions of the html code you want to
133    /// keep or remove.
134    ///
135    /// # Returns
136    ///
137    /// The html tree obtains by keeping only the nodes that fulfil the
138    /// filter.
139    #[must_use]
140    pub fn filter(self, filter: &Filter) -> Self {
141        filter_aux(Cow::Owned(self), filter, false).html
142    }
143
144    /// Finds an html node based on a defined filter.
145    ///
146    /// See [`Filter`] to know how to define a filter.
147    ///
148    /// Filters allow you to select the portions of the html code you want to
149    /// keep or remove.
150    ///
151    /// # Returns
152    ///
153    /// The first node that fulfils the filter.
154    #[must_use]
155    pub fn find(self, filter: &Filter) -> Self {
156        self.filter(filter).into_first()
157    }
158
159    /// Keeps only the first element of a filtered output
160    fn into_first(self) -> Self {
161        if let Self::Vec(vec) = self {
162            for elt in vec {
163                let res = elt.into_first();
164                if !res.is_empty() {
165                    return res;
166                }
167            }
168            safe_unreachable!("Filtering removes empty nodes in vec.")
169        } else {
170            self
171        }
172    }
173
174    /// Filters html based on a defined filter.
175    ///
176    /// Equivalent of [`Html::filter`] when data is not owned.
177    #[must_use]
178    pub fn to_filtered(&self, filter: &Filter) -> Self {
179        filter_aux(Cow::Borrowed(self), filter, false).html
180    }
181
182    /// Finds an html node based on a defined filter.
183    ///
184    /// Equivalent of [`Html::find`] when data is not owned.
185    // TODO: data except first is cloned
186    #[must_use]
187    pub fn to_found(&self, filter: &Filter) -> Self {
188        self.to_filtered(filter).into_first()
189    }
190}
191
192/// Wrapper for [`Html::filter`]
193///
194/// Refer to [`Html::filter`] for documentation.
195///
196/// This methods takes an additional `clean` boolean to indicate when a tag
197/// returns the child. In that case, the texts must disappear if present at
198/// root.
199///
200/// This methods returns a wrapper of the final html in a [`FilterSuccess`]
201/// to follow the current depth of the last found node. See
202/// [`FilterSuccess`] for more information.
203#[allow(clippy::allow_attributes, reason = "expect is buggy")]
204#[allow(clippy::enum_glob_use, reason = "heavy syntax and Html is the main struct")]
205fn filter_aux(cow_html: Cow<'_, Html>, filter: &Filter, found: bool) -> FilterSuccess {
206    use Html::*;
207    match cow_html {
208        Cow::Borrowed(Comment(_)) | Cow::Owned(Comment(_))
209            if !filter.comment_explicitly_allowed() =>
210            None,
211        Cow::Borrowed(Doctype { .. }) | Cow::Owned(Doctype { .. }) if !filter.doctype_allowed() =>
212            None,
213        Cow::Borrowed(Doctype { .. } | Comment(_)) | Cow::Owned(Doctype { .. } | Comment(_)) =>
214            FilterSuccess::make_none(cow_html),
215        Cow::Borrowed(Text(text)) if filter.text_explicitly_allowed() && filter.should_trim() =>
216            FilterSuccess::make_none(Cow::Owned(Html::trim_text(text))),
217        Cow::Owned(Text(text)) if filter.text_explicitly_allowed() && filter.should_trim() =>
218            FilterSuccess::make_none(Cow::Owned(Html::trim_text(&text))),
219        Cow::Borrowed(Text(_)) | Cow::Owned(Text(_)) if filter.text_explicitly_allowed() =>
220            FilterSuccess::make_none(cow_html),
221        Cow::Borrowed(Text(_) | Empty) | Cow::Owned(Text(_) | Empty) => None,
222        // incorrect
223        Cow::Borrowed(Tag { tag, child }) =>
224            filter_aux_tag(Cow::Borrowed(&**child), Cow::Borrowed(tag), filter, found),
225        Cow::Owned(Tag { tag, child }) =>
226            filter_aux_tag(Cow::Owned(*child), Cow::Owned(tag), filter, found),
227        Cow::Borrowed(Vec(vec)) => filter_aux_vec(Cow::Borrowed(vec), filter),
228        Cow::Owned(Vec(vec)) => filter_aux_vec(Cow::Owned(vec), filter),
229    }
230    .unwrap_or_default()
231}
232
233/// Auxiliary method for [`filter_aux`] on [`Html::Tag`]
234#[expect(clippy::arithmetic_side_effects, reason = "incr depth when smaller than filter_depth")]
235fn filter_aux_tag(
236    child: Cow<'_, Html>,
237    tag: Cow<'_, Tag>,
238    filter: &Filter,
239    found: bool,
240) -> Option<FilterSuccess> {
241    if filter.tag_allowed(tag.as_ref()) {
242        FilterSuccess::make_found(Html::Tag {
243            tag: tag.into_owned(),
244            child: Box::new(filter_light(child, filter)),
245        })
246    } else if filter.as_depth() == 0 {
247        filter_aux(child, filter, found).incr()
248    } else {
249        let rec = filter_aux(child, filter, found);
250        match rec.depth {
251            DepthSuccess::None => None,
252            DepthSuccess::Success => Some(rec),
253            DepthSuccess::Found(depth) => match depth.cmp(&filter.as_depth()) {
254                Ordering::Less => Some(FilterSuccess {
255                    depth: DepthSuccess::Found(depth + 1),
256                    html: Html::Tag { tag: tag.into_owned(), child: Box::new(rec.html) },
257                }),
258                Ordering::Equal | Ordering::Greater =>
259                    Some(FilterSuccess { depth: DepthSuccess::Success, html: rec.html }),
260            },
261        }
262    }
263}
264
265/// Auxiliary method for [`filter_aux`] on [`Html::Vec`]
266#[expect(clippy::arithmetic_side_effects, reason = "incr depth when smaller than filter_depth")]
267fn filter_aux_vec(vec: Cow<'_, Box<[Html]>>, filter: &Filter) -> Option<FilterSuccess> {
268    match vec
269        .as_ref()
270        .iter()
271        .filter_map(|child| child.check_depth(filter.as_depth() + 1, filter))
272        .min()
273    {
274        Some(depth) if depth < filter.as_depth() => Some(FilterSuccess {
275            depth: DepthSuccess::Found(depth),
276            html: unwrap_vec(
277                vec.iter()
278                    .map(|child| filter_light(Cow::Borrowed(child), filter))
279                    .filter(|child| !child.is_empty())
280                    .collect(),
281                filter.as_collapse(),
282            ),
283        }),
284        Some(_) => Some(FilterSuccess {
285            depth: DepthSuccess::Success,
286            html: unwrap_vec(
287                into_iter_filter_map_collect(vec, |child| {
288                    let rec = filter_aux(child, filter, true).html;
289                    if rec.is_empty() { None } else { Some(rec) }
290                }),
291                filter.as_collapse(),
292            ),
293        }),
294        None => {
295            let mut filtered: Vec<FilterSuccess> = into_iter_filter_map_collect(vec, |child| {
296                let rec = filter_aux(child, filter, false);
297                if rec.html.is_empty() { None } else { Some(rec) }
298            });
299            if filtered.len() <= 1 {
300                filtered.pop()
301            } else {
302                filtered.iter().map(|child| child.depth).min().map(|depth| FilterSuccess {
303                    depth,
304                    html: unwrap_vec(
305                        filtered.into_iter().map(|child| child.html).collect(),
306                        filter.as_collapse(),
307                    ),
308                })
309            }
310        }
311    }
312}
313
314/// Light filter without complicated logic, just filtering on types.
315///
316/// This method does take into account the [`Filter::tag_name`],
317///   [`Filter::attribute_name`] and [`Filter::attribute_value`] methods,
318/// only the types of [`NodeTypeFilter`].
319///
320/// The return type is [`Html`] and not [`Cow`] has it is only called on
321/// successes.
322#[allow(clippy::allow_attributes, reason = "expect is buggy")]
323#[allow(clippy::enum_glob_use, reason = "heavy syntax and Html is the main struct")]
324fn filter_light(cow_html: Cow<'_, Html>, filter: &Filter) -> Html {
325    use Html::*;
326    #[allow(clippy::ref_patterns, reason = "!")]
327    match cow_html {
328        Cow::Borrowed(Text(txt)) if filter.text_allowed() && filter.should_trim() =>
329            Html::trim_text(txt),
330        Cow::Owned(Text(txt)) if filter.text_allowed() && filter.should_trim() =>
331            Html::trim_text(&txt),
332        Cow::Owned(Text(_)) | Cow::Borrowed(Text(_)) if filter.text_allowed() =>
333            cow_html.into_owned(),
334        Cow::Borrowed(Comment(_)) | Cow::Owned(Comment(_)) if filter.comment_allowed() =>
335            cow_html.into_owned(),
336        Cow::Borrowed(Doctype { .. }) | Cow::Owned(Doctype { .. }) if filter.doctype_allowed() =>
337            cow_html.into_owned(),
338        Cow::Borrowed(Tag { tag, .. }) if filter.tag_explicitly_blacklisted(tag) => Html::Empty,
339        Cow::Owned(Tag { tag, .. }) if filter.tag_explicitly_blacklisted(&tag) => Html::Empty,
340        Cow::Borrowed(Tag { tag, child }) => Tag {
341            tag: tag.to_owned(),
342            child: Box::new(filter_light(Cow::Borrowed(&**child), filter)),
343        },
344        Cow::Owned(Tag { tag, child }) =>
345            Tag { tag, child: Box::new(filter_light(Cow::Owned(*child), filter)) },
346        Cow::Borrowed(Vec(vec)) => unwrap_vec(
347            vec.iter()
348                .map(|child| filter_light(Cow::Borrowed(child), filter))
349                .filter(|html| !html.is_empty())
350                .collect(),
351            filter.as_collapse(),
352        ),
353        Cow::Owned(Vec(vec)) => unwrap_vec(
354            vec.into_iter()
355                .map(|child| filter_light(Cow::Owned(child), filter))
356                .filter(|html| !html.is_empty())
357                .collect(),
358            filter.as_collapse(),
359        ),
360        Cow::Borrowed(Empty | Text(_) | Comment { .. } | Doctype { .. })
361        | Cow::Owned(Empty | Text(_) | Comment { .. } | Doctype { .. }) => Html::Empty,
362    }
363}
364
365/// Unwrap a [`Vec<Html>`] to not have vecs of 0 and 1 element.
366fn unwrap_vec(vec: Vec<Html>, collapse: bool) -> Html {
367    let mut res = if collapse {
368        let mut previous = String::new();
369        let mut res = Vec::with_capacity(vec.len());
370        for this in vec {
371            if let Html::Text(text) = this {
372                previous.push_str(&text);
373            } else {
374                if !previous.is_empty() {
375                    res.push(Html::Text(take(&mut previous)));
376                }
377                res.push(this);
378            }
379        }
380        if !previous.is_empty() {
381            res.push(Html::Text(take(&mut previous)));
382        }
383        res
384    } else {
385        vec
386    };
387    if res.len() <= 1 {
388        res.first_mut().map(take).unwrap_or_default()
389    } else {
390        Html::Vec(res.into_boxed_slice())
391    }
392}
393
/// Method to apply [`Iterator::filter_map`] on an iterator inside a Cow,
/// without losing the Cow.
///
/// Every element of `cow` is handed to `map` wrapped in the same kind of
/// [`Cow`] as the container: borrowed elements stay borrowed, owned elements
/// are moved out without being cloned. The `Some` results are collected, in
/// order, into `V`.
fn into_iter_filter_map_collect<T, U, V, F>(cow: Cow<'_, Box<[T]>>, mut map: F) -> V
where
    T: Clone,
    V: FromIterator<U>,
    F: FnMut(Cow<'_, T>) -> Option<U>,
{
    match cow {
        Cow::Borrowed(borrowed) =>
            borrowed.iter().filter_map(|elt| map(Cow::Borrowed(elt))).collect(),
        // `into_vec` gives a by-value iterator whatever the edition: the method-call
        // form `.into_iter()` on a boxed slice only yields owned items in edition 2024.
        Cow::Owned(owned) =>
            owned.into_vec().into_iter().filter_map(|elt| map(Cow::Owned(elt))).collect(),
    }
}
407}