html_filter/filter/
mod.rs

1//! Module to filter an HTML tree to keep or remove specific nodes, with a set
2//! of rules.
3//!
4//! You can either filter your HTML with [`Html::filter`] or find a specific
5//! node with [`Html::find`].
6//!
7//! For more information on how to define the filtering rules, please refer to
8//! [`Filter`].
9
10extern crate alloc;
11mod element;
12mod node_type;
13pub mod types;
14
15use alloc::borrow::Cow;
16use core::cmp::Ordering;
17
18use node_type::NodeTypeFilter;
19use types::Filter;
20
21use crate::errors::{safe_expect, safe_unreachable};
22use crate::prelude::{Html, Tag};
23
/// Progress marker carried up the recursion: tells whether a wanted node was
/// met below the current node and, if so, how far above it we currently are.
///
/// # Note
///
/// The discriminants are written out (which requires the explicit `repr`) so
/// that the derived [`Ord`] ranks the variants as
/// `Success < Found(_) < None`; two `Found` values are then compared by their
/// depth. Taking the minimum of several states therefore favours the most
/// advanced one.
#[repr(u8)]
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
enum DepthSuccess {
    /// A wanted node was met and the requested depth is already reached
    Success = 0,
    /// A wanted node was met, but more ancestors are still needed; the payload
    /// is the number of levels collected so far
    Found(usize) = 1,
    /// No wanted node here: the node doesn't satisfy the filters
    #[default]
    None = 2,
}
41
42impl DepthSuccess {
43    /// Increment the depth, if applicable
44    fn incr(mut self) -> Self {
45        if let Self::Found(depth) = &mut self {
46            *depth = safe_expect!(depth.checked_add(1), "Smaller than required depth");
47        }
48
49        self
50    }
51}
52
53/// Status of the filtering on recursion calls
54#[derive(Default, Debug)]
55struct FilterSuccess {
56    /// Indicates if the filter found a wanted node
57    ///
58    /// Is
59    /// - `None` if no wanted node was found
60    /// - `Some(depth)` if a wanted node was found at depth `depth`. If there
61    ///   are embedded nodes that satisfy the filter, `depth` is the smallest
62    ///   possible.
63    depth: DepthSuccess,
64    /// Result of the filtering
65    html: Html,
66}
67
68impl FilterSuccess {
69    /// Increment the depth, if applicable
70    #[expect(clippy::unnecessary_wraps, reason = "useful for filter method")]
71    fn incr(mut self) -> Option<Self> {
72        self.depth = self.depth.incr();
73        Some(self)
74    }
75
76    /// Creates a [`FilterSuccess`] from an [`Html`]
77    ///
78    /// This is the method to use when the node is considered `found`, i.e.,
79    /// when it was the node the user was looking for.
80    #[expect(clippy::unnecessary_wraps, reason = "useful for filter method")]
81    const fn make_found(html: Html) -> Option<Self> {
82        Some(Self { depth: DepthSuccess::Found(0), html })
83    }
84
85    /// Creates a [`FilterSuccess`] from an [`Html`]
86    ///
87    /// This is the method to use when the node isn't interesting alone, it can
88    /// be if it is in the right scope though.
89    #[expect(clippy::unnecessary_wraps, reason = "useful for filter method")]
90    fn make_none(html: Cow<'_, Html>) -> Option<Self> {
91        Some(Self { depth: DepthSuccess::None, html: html.into_owned() })
92    }
93}
94
95impl Html {
96    /// Method to check if a wanted node is visible
97    ///
98    /// This methods stop checking after a maximum depth, as the current node
99    /// will be discarded if it is deeper in the tree.
100    fn check_depth(&self, max_depth: usize, filter: &Filter) -> Option<usize> {
101        match self {
102            Self::Empty | Self::Text(_) | Self::Comment { .. } | Self::Doctype { .. } => None,
103            Self::Tag { tag, .. } if filter.tag_explicitly_allowed(tag) => Some(0),
104            Self::Tag { .. } | Self::Vec(_) if max_depth == 0 => None,
105            Self::Tag { child, .. } => child
106                .check_depth(
107                    #[expect(clippy::arithmetic_side_effects, reason = "non-0")]
108                    {
109                        max_depth - 1
110                    },
111                    filter,
112                )
113                .map(
114                    #[expect(clippy::arithmetic_side_effects, reason = "< initial max_depth")]
115                    |depth| depth + 1,
116                ),
117            Self::Vec(vec) => vec
118                .iter()
119                .try_fold(Some(usize::MAX), |acc, child| {
120                    if acc == Some(0) {
121                        Err(())
122                    } else {
123                        Ok(child.check_depth(max_depth, filter))
124                    }
125                })
126                .unwrap_or(Some(0)),
127        }
128    }
129
130    /// Filters html based on a defined filter.
131    ///
132    /// See [`Filter`] to learn how to create filters.
133    ///
134    /// Filters allow you to select the portions of the html code you want to
135    /// keep or remove.
136    ///
137    /// # Returns
138    ///
139    /// The html tree obtains by keeping only the nodes that fulfills the
140    /// filter.
141    #[must_use]
142    pub fn filter(self, filter: &Filter) -> Self {
143        filter_aux(Cow::Owned(self), filter, false).html
144    }
145
146    /// Finds an html node based on a defined filter.
147    ///
148    /// See [`Filter`] to know how to define a filter.
149    ///
150    /// Filters allow you to select the portions of the html code you want to
151    /// keep or remove.
152    ///
153    /// # Returns
154    ///
155    /// The first node that fulfills the filter.
156    #[must_use]
157    pub fn find(self, filter: &Filter) -> Self {
158        self.filter(filter).into_first()
159    }
160
161    /// Keeps only the first element of a filtered output
162    fn into_first(self) -> Self {
163        if let Self::Vec(vec) = self {
164            for elt in vec {
165                let res = elt.into_first();
166                if !res.is_empty() {
167                    return res;
168                }
169            }
170            safe_unreachable("Filtering removes empty nodes in vec.")
171        } else {
172            self
173        }
174    }
175
176    /// Filters html based on a defined filter.
177    ///
178    /// Equivalent of [`Html::filter`] when data is not owned.
179    #[must_use]
180    pub fn to_filtered(&self, filter: &Filter) -> Self {
181        filter_aux(Cow::Borrowed(self), filter, false).html
182    }
183
184    /// Finds an html node based on a defined filter.
185    ///
186    /// Equivalent of [`Html::find`] when data is not owned.
187    //TODO: data except first is cloned
188    #[must_use]
189    pub fn to_found(&self, filter: &Filter) -> Self {
190        self.to_filtered(filter).into_first()
191    }
192}
193
194/// Wrapper for [`Html::filter`]
195///
196/// Refer to [`Html::filter`] for documentation.
197///
198/// This methods takes an additional `clean` boolean to indicate when a tag
199/// returns the child. In that case, the texts must disappear if present at
200/// root.
201///
202/// This methods returns a wrapper of the final html in a [`FilterSuccess`]
203/// to follow the current depth of the last found node. See
204/// [`FilterSuccess`] for more information.
205#[allow(clippy::allow_attributes, reason = "expect is buggy")]
206#[allow(
207    clippy::enum_glob_use,
208    reason = "heavy syntax and Html is the main struct"
209)]
210fn filter_aux(cow_html: Cow<'_, Html>, filter: &Filter, found: bool) -> FilterSuccess {
211    use Html::*;
212    match cow_html {
213        Cow::Borrowed(Comment(_)) | Cow::Owned(Comment(_))
214            if found || !filter.comment_explicitly_allowed() =>
215            None,
216        Cow::Borrowed(Doctype { .. }) | Cow::Owned(Doctype { .. })
217            if found || !filter.doctype_allowed() =>
218            None,
219        Cow::Borrowed(Doctype { .. } | Comment(_)) | Cow::Owned(Doctype { .. } | Comment(_)) =>
220            FilterSuccess::make_none(cow_html),
221        Cow::Borrowed(Text(_) | Empty) | Cow::Owned(Text(_) | Empty) => None,
222        Cow::Borrowed(Tag { tag, child }) =>
223            filter_aux_tag(Cow::Borrowed(&**child), Cow::Borrowed(tag), filter, found),
224        Cow::Owned(Tag { tag, child }) =>
225            filter_aux_tag(Cow::Owned(*child), Cow::Owned(tag), filter, found),
226        Cow::Borrowed(Vec(vec)) => filter_aux_vec(Cow::Borrowed(vec), filter),
227        Cow::Owned(Vec(vec)) => filter_aux_vec(Cow::Owned(vec), filter),
228    }
229    .unwrap_or_default()
230}
231
232/// Auxiliary method for [`filter_aux`] on [`Html::Tag`]
233#[expect(
234    clippy::arithmetic_side_effects,
235    reason = "incr depth when smaller than filter_depth"
236)]
237fn filter_aux_tag(
238    child: Cow<'_, Html>,
239    tag: Cow<'_, Tag>,
240    filter: &Filter,
241    found: bool,
242) -> Option<FilterSuccess> {
243    if filter.tag_allowed(tag.as_ref()) {
244        FilterSuccess::make_found(Html::Tag {
245            tag: tag.into_owned(),
246            child: Box::new(filter_light(child, filter)),
247        })
248    } else if filter.as_depth() == 0 {
249        filter_aux(child, filter, found).incr()
250    } else {
251        let rec = filter_aux(child, filter, found);
252        match rec.depth {
253            DepthSuccess::None => None,
254            DepthSuccess::Success => Some(rec),
255            DepthSuccess::Found(depth) => match depth.cmp(&filter.as_depth()) {
256                Ordering::Less => Some(FilterSuccess {
257                    depth: DepthSuccess::Found(depth + 1),
258                    html: Html::Tag { tag: tag.into_owned(), child: Box::new(rec.html) },
259                }),
260                Ordering::Equal | Ordering::Greater =>
261                    Some(FilterSuccess { depth: DepthSuccess::Success, html: rec.html }),
262            },
263        }
264    }
265}
266
267/// Auxiliary method for [`filter_aux`] on [`Html::Vec`]
268#[expect(
269    clippy::arithmetic_side_effects,
270    reason = "incr depth when smaller than filter_depth"
271)]
272fn filter_aux_vec(vec: Cow<'_, Box<[Html]>>, filter: &Filter) -> Option<FilterSuccess> {
273    match vec
274        .as_ref()
275        .iter()
276        .filter_map(|child| child.check_depth(filter.as_depth() + 1, filter))
277        .min()
278    {
279        Some(depth) if depth < filter.as_depth() => Some(FilterSuccess {
280            depth: DepthSuccess::Found(depth),
281            html: Html::Vec(
282                vec.iter()
283                    .map(|child| filter_light(Cow::Borrowed(child), filter))
284                    .collect(),
285            ),
286        }),
287        Some(_) => Some(FilterSuccess {
288            depth: DepthSuccess::Success,
289            html: Html::Vec(into_iter_filter_map_collect(vec, |child| {
290                let rec = filter_aux(child, filter, true);
291                if rec.html.is_empty() {
292                    None
293                } else {
294                    Some(rec.html)
295                }
296            })),
297        }),
298        None => {
299            let mut filtered: Vec<FilterSuccess> = into_iter_filter_map_collect(vec, |child| {
300                let rec = filter_aux(child, filter, false);
301                if rec.html.is_empty() { None } else { Some(rec) }
302            });
303            if filtered.len() <= 1 {
304                filtered.pop()
305            } else {
306                filtered
307                    .iter()
308                    .map(|child| child.depth)
309                    .min()
310                    .map(|depth| FilterSuccess {
311                        depth,
312                        html: Html::Vec(filtered.into_iter().map(|child| child.html).collect()),
313                    })
314            }
315        }
316    }
317}
318
319/// Light filter without complicated logic, just filtering on types.
320///
321/// This method does take into account the [`Filter::tag_name`],
322///   [`Filter::attribute_name`] and [`Filter::attribute_value`] methods,
323/// only the types of [`NodeTypeFilter`].
324///
325/// The return type is [`Html`] and not [`Cow`] has it is only called on
326/// successes.
327#[allow(clippy::allow_attributes, reason = "expect is buggy")]
328#[allow(
329    clippy::enum_glob_use,
330    reason = "heavy syntax and Html is the main struct"
331)]
332fn filter_light(cow_html: Cow<'_, Html>, filter: &Filter) -> Html {
333    use Html::*;
334    match cow_html {
335        Cow::Borrowed(Text(_)) | Cow::Owned(Text(_)) if filter.text_allowed() =>
336            cow_html.into_owned(),
337        Cow::Borrowed(Comment(_)) | Cow::Owned(Comment(_)) if filter.comment_allowed() =>
338            cow_html.into_owned(),
339        Cow::Borrowed(Doctype { .. }) | Cow::Owned(Doctype { .. }) if filter.doctype_allowed() =>
340            cow_html.into_owned(),
341        Cow::Borrowed(Tag { tag, .. }) if filter.tag_explicitly_blacklisted(tag) => Html::Empty,
342        Cow::Owned(Tag { tag, .. }) if filter.tag_explicitly_blacklisted(&tag) => Html::Empty,
343        Cow::Borrowed(Tag { tag, child }) => Tag {
344            tag: tag.to_owned(),
345            child: Box::new(filter_light(Cow::Borrowed(&**child), filter)),
346        },
347        Cow::Owned(Tag { tag, child }) =>
348            Tag { tag, child: Box::new(filter_light(Cow::Owned(*child), filter)) },
349        Cow::Borrowed(Vec(vec)) => Html::Vec(
350            vec.into_iter()
351                .map(|child| filter_light(Cow::Borrowed(child), filter))
352                .collect(),
353        ),
354        Cow::Owned(Vec(vec)) => Html::Vec(
355            vec.into_iter()
356                .map(|child| filter_light(Cow::Owned(child), filter))
357                .collect(),
358        ),
359        Cow::Borrowed(Empty | Text(_) | Comment { .. } | Doctype { .. })
360        | Cow::Owned(Empty | Text(_) | Comment { .. } | Doctype { .. }) => Html::Empty,
361    }
362}
363
/// Applies [`Iterator::filter_map`] on the elements of a boxed slice held in a
/// [`Cow`], without losing the [`Cow`]: the elements of a borrowed slice are
/// handed to `map` as [`Cow::Borrowed`], those of an owned slice are moved out
/// and handed as [`Cow::Owned`], so nothing is cloned here.
///
/// `map` is called once per element, in order, and may keep state between
/// calls. The elements for which it returns `Some` are collected into `V`.
pub fn into_iter_filter_map_collect<T, U, V, F>(cow: Cow<'_, Box<[T]>>, mut map: F) -> V
where
    T: Clone,
    V: FromIterator<U>,
    F: FnMut(Cow<'_, T>) -> Option<U>,
{
    match cow {
        Cow::Borrowed(borrowed) => borrowed
            .iter()
            .filter_map(|elt| map(Cow::Borrowed(elt)))
            .collect(),
        // Going through `Vec` yields the elements by value in every edition,
        // whereas `Box<[T]>::into_iter()` only does so from edition 2024.
        Cow::Owned(owned) => owned
            .into_vec()
            .into_iter()
            .filter_map(|elt| map(Cow::Owned(elt)))
            .collect(),
    }
}