1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
//! Mutating visitor support for `Document`.

use std::cell::RefCell;

use log::debug;

use crate::chars::{is_all_ctrl_ws, replace_chars};
use crate::dom::{
    html::{t, TAG_META},
    Document, Element, NodeData, NodeId, NodeRef, StrTendril
};

/// An instruction returned by the `Fn` closure used by [`Document::filter`].
#[derive(Debug, PartialEq, Eq)]
pub enum Action {
    /// Continue filtering, without further changes to this `Node`.
    Continue,

    /// Detach this `Node`, and any children, from the tree.
    Detach,

    /// Replace this `Node` with its children. Equivalent to `Detach` if
    /// returned for a `Node` with no children.
    Fold,
}

/// Mutating filter methods.
impl Document {
    /// Perform a depth-first (children before parent nodes) walk of the
    /// entire `Document`, including synthetic document node, applying the
    /// provided function.
    ///
    /// See [`Document::filter_at`] for additional details.
    pub fn filter<F>(&mut self, mut f: F)
        where F: Fn(NodeRef<'_>, &mut NodeData) -> Action
    {
        self.filter_at_ref(Document::DOCUMENT_NODE_ID, true, &mut f);
    }

    /// Perform a breadth-first (children after parent nodes) walk of the
    /// entire `Document`, including synthetic document node, applying the
    /// provided function.
    ///
    /// See [`Document::filter_at`] for additional details.
    pub fn filter_breadth<F>(&mut self, mut f: F)
        where F: Fn(NodeRef<'_>, &mut NodeData) -> Action
    {
        self.filter_at_ref(Document::DOCUMENT_NODE_ID, false, &mut f);
    }

    /// Perform a depth-first (children before parent nodes) walk from the
    /// specified node ID, applying the provided function.
    ///
    /// ### Traversal order
    ///
    /// This variant performs a depth-first (children before parent nodes) tree
    /// walk, but there is also [`Document::filter_at_breadth`], a
    /// breadth-first (parent before children) variant. Filter functions such
    /// as [`detach_banned_elements`] may perform better breadth-first (but are
    /// _compatible_ with both traversal orders). Other functions such as
    /// [`fold_empty_inline`] and [`text_normalize`] will only yield complete
    /// results when run depth-first. See individual functions for
    /// compatibility with traversal orders.
    ///
    /// ### Filter functions
    ///
    /// The `f` parameter can be a closure or free-function in the form:
    ///
    /// ```norun
    /// fn a_filter_fn(pos: NodeRef<'_>, data: &mut NodeData) -> Action;
    /// ```
    ///
    /// Where `data` provides read-write access to the the `NodeData` of the
    /// current node being visited, and `pos` gives a read-only view to the
    /// remainder of the `Document`, e.g. parent, children, and siblings of the
    /// current node. Note that to avoid aliasing issues, the `NodeData` is
    /// actually moved out of the `Document` and replaced with a
    /// `NodeData::Hole` value which could be observed via `pos`. The
    /// potentially modified `NodeData` is moved back to the `Document` if the
    /// function returns `Action::Continue`. The function may also modify the
    /// `Document` by returning other [`Action`] values.
    ///
    /// For convenience and efficiency, multiple filter functions can be
    /// combined via the [`chain_filters`] macro and run in one pass.
    ///
    /// Note that to free up all memory associated with filtered `Node`s that
    /// have been detached, use [`Document::deep_clone`] and drop the original
    /// `Document.`.
    pub fn filter_at<F>(&mut self, id: NodeId, mut f: F)
        where F: Fn(NodeRef<'_>, &mut NodeData) -> Action
    {
        self.filter_at_ref(id, true, &mut f);
    }

    /// Perform a breadth-first (children after parent nodes) walk from the
    /// specified node ID, applying the provided function.
    ///
    /// See [`Document::filter_at`] for additional details.
    pub fn filter_at_breadth<F>(&mut self, id: NodeId, mut f: F)
        where F: Fn(NodeRef<'_>, &mut NodeData) -> Action
    {
        self.filter_at_ref(id, false, &mut f);
    }

    fn filter_at_ref<F>(&mut self, id: NodeId, depth_first: bool, f: &mut F)
        -> Action
        where F: Fn(NodeRef<'_>, &mut NodeData) -> Action
    {
        let res = if depth_first {
            self.walk_depth(id, f)
        } else {
            self.walk_breadth(id, f)
        };

        match res {
            Action::Continue => {},
            Action::Fold => {
                self.fold(id);
            }
            Action::Detach => {
                self.detach(id);
            }
        }
        res
    }

    fn walk_depth<F>(&mut self, id: NodeId, f: &mut F) -> Action
        where F: Fn(NodeRef<'_>, &mut NodeData) -> Action
    {
        // Children first, recursively
        let mut next_child = self[id].first_child;
        while let Some(child) = next_child {
            // set before possible loss by filter action
            next_child = self[child].next_sibling;
            self.filter_at_ref(child, true, f);
        }

        self.filter_node(id, f)
    }

    fn walk_breadth<F>(&mut self, id: NodeId, f: &mut F) -> Action
        where F: Fn(NodeRef<'_>, &mut NodeData) -> Action
    {
        let res = self.filter_node(id, f);
        if res != Action::Continue {
            return res;
        }

        // Children after, recursively
        let mut next_child = self[id].first_child;
        while let Some(child) = next_child {
            // set before possible loss by filter action
            next_child = self[child].next_sibling;
            let prev = self[child].prev_sibling;
            let parent = self[child].parent;

            let res = self.filter_at_ref(child, false, f);

            if res == Action::Fold {
                if let Some(p) = prev {
                    next_child = self[p].next_sibling;
                } else if let Some(p) = parent {
                    next_child = self[p].first_child;
                }
            }
        }

        Action::Continue
    }

    fn filter_node<F>(&mut self, id: NodeId, f: &mut F) -> Action
        where F: Fn(NodeRef<'_>, &mut NodeData) -> Action
    {
        // We need to replace node.data with a placeholder (Hole) to appease
        // the borrow checker. Otherwise there would be an aliasing problem
        // where the Document (&self) reference could see the same NodeData
        // passed as &mut.
        let mut ndata = std::mem::replace(&mut self[id].data, NodeData::Hole);
        let res = f(NodeRef::new(self, id), &mut ndata);
        // We only need to reset the potentially mutated node.data if the
        // action is to continue, as all other cases result in the node
        // being detached.
        if res == Action::Continue {
            let node = &mut self[id];
            match ndata {
                NodeData::Document | NodeData::Elem(_) => {}
                NodeData::Hole => {
                    debug_assert!(false, "Filter changed to {:?}", ndata);
                }
                _ => {
                    debug_assert!(
                        node.first_child.is_none() && node.last_child.is_none(),
                        "Filter changed node {:?} with children to {:?}",
                        id, ndata);
                }
            }
            node.data = ndata;
        }
        res
    }
}

/// Compose a new filter closure, by chaining a list of 1 to many closures or
/// function paths. Each is executed in order, while the returned action remains
/// `Action::Continue`, or otherwise terminated early.
#[macro_export]
macro_rules! chain_filters {
    ($solo:expr $(,)?) => (
        |pos: $crate::NodeRef<'_>, data: &mut $crate::NodeData| {
            $solo(pos, data)
        }
    );
    ($first:expr $(, $subs:expr)+ $(,)?) => (
        |pos: $crate::NodeRef<'_>, data: &mut $crate::NodeData| {
            let mut action: $crate::filter::Action = $first(pos, data);
        $(
            if action == $crate::filter::Action::Continue {
                action = $subs(pos, data);
            }
        )*
            action
        }
    );
}

/// Detach known banned elements
/// ([`TagMeta::is_banned`](crate::html::TagMeta::is_banned)) and any elements
/// which are unknown.
///
/// Compatible with depth or breadth-first filtering, but more efficiently
/// executed breadth-first.
pub fn detach_banned_elements(_p: NodeRef<'_>, data: &mut NodeData) -> Action {
    if let Some(ref mut elm) = data.as_element_mut() {
        if let Some(tmeta) = TAG_META.get(&elm.name.local) {
            if tmeta.is_banned() {
                return Action::Detach;
            }
        } else {
            debug!("Detaching unknown element tag {}", &elm.name.local);
            return Action::Detach;
        }
    }
    Action::Continue
}

/// Fold meaningless inline elements, which are empty or contain only logical
/// whitespace.
///
/// Logical whitespace is defined as all Unicode whitespace or control chars in
/// child text, or the `<br>` element. Non-text oriented inline elements like
/// `<img>` and `<video>` and other multi-media are excluded from
/// consideration.
///
/// Should be run depth-first for complete filtering.
pub fn fold_empty_inline(pos: NodeRef<'_>, data: &mut NodeData) -> Action {
    if  is_inline(data) &&
        !is_multi_media(data) &&
        pos.children().all(is_logical_ws)
    {
        Action::Fold
    } else {
        Action::Continue
    }
}

/// Detach any comment nodes.
///
/// Compatible with depth or breadth-first filtering.
pub fn detach_comments(_p: NodeRef<'_>, data: &mut NodeData) -> Action {
    if let NodeData::Comment(_) = data {
        Action::Detach
    } else {
        Action::Continue
    }
}

/// Detach any processing instruction nodes.
///
/// Compatible with depth or breadth-first filtering.
pub fn detach_pis(_p: NodeRef<'_>, data: &mut NodeData) -> Action {
    if let NodeData::Pi(_) = data {
        Action::Detach
    } else {
        Action::Continue
    }
}

/// Filter out attributes that are not included in the "basic" set
/// [`TagMeta`](crate::html::TagMeta) for each element.
///
/// Compatible with depth or breadth-first filtering.
pub fn retain_basic_attributes(_p: NodeRef<'_>, data: &mut NodeData)
    -> Action
{
    if let Some(ref mut elm) = data.as_element_mut() {
        if let Some(tmeta) = TAG_META.get(&elm.name.local) {
            elm.attrs.retain(|a| tmeta.has_basic_attr(&a.name.local));
        } else {
            debug!("unknown tag {:?}, attributes unmodified", &elm.name.local);
        }
    }
    Action::Continue
}

/// Normalize text nodes by merging, replacing control characters and
/// minimizing whitespace.
///
/// The filter is aware of whitespace significance rules in HTML `<pre>` (or
/// similar tag) blocks as well as block vs inline elements in general. It
/// assumes, without knowledge of any potential unconventional external styling,
/// that leading and trailing whitespace may be removed at block element
/// boundaries.
///
/// Because this filter works on text nodes, results are better if applied in
/// its own `Document::filter` pass, depth first, and _after_ any pass
/// containing filters that detach or fold elements, such as
/// [`detach_banned_elements`] or [`fold_empty_inline`]. Otherwise the filter
/// may not be able to merge text node's which become siblings too late in the
/// process, resulting in additional unnecessary whitespace.
pub fn text_normalize(pos: NodeRef<'_>, data: &mut NodeData) -> Action {
    thread_local! {
        static MERGE_Q: RefCell<StrTendril> = RefCell::new(StrTendril::new())
    };

    if let Some(t) = data.as_text_mut() {
        // If the immediately following sibling is also text, then push this
        // tendril to the merge queue and detach.
        let node_r = pos.next_sibling();
        if node_r.map_or(false, |n| n.as_text().is_some()) {
            MERGE_Q.with(|q| {
                q.borrow_mut().push_tendril(t)
            });
            return Action::Detach;
        }

        // Otherwise add this tendril to anything in the queue, consuming it.
        MERGE_Q.with(|q| {
            let mut qt = q.borrow_mut();
            if qt.len() > 0 {
                qt.push_tendril(t);
                drop(qt);
                *t = q.replace(StrTendril::new());
            }
        });

        let parent = pos.parent().unwrap();
        let parent_is_block = is_block(parent);
        let in_pre = parent.node_and_ancestors().any(is_preform_node);

        let node_l = pos.prev_sibling();
        let trim_l = node_l.map_or(parent_is_block, is_block);
        let trim_r = node_r.map_or(parent_is_block, is_block);

        replace_chars(t, !in_pre, true, trim_l, trim_r);

        if t.is_empty() {
            return Action::Detach;
        }
    }
    Action::Continue
}

// FIXME: Consider also offering a simpler version of the above for XML or
// where speed trumps precision.

/// Convert any `<xmp>`, `<listing>`, or `<plaintext>` elements to `<pre>`.
///
/// The `<xmp>`, `<listing>` and `<plaintext>` tags are deprecated in later
/// HTML versions and are XML/XHTML incompatible, but can still can be found in
/// the wild.  After the HTML parse where special internal markup rules are
/// applied, these are roughly equivalent to `<pre>`, and its safer if
/// converted.
pub fn xmp_to_pre(_p: NodeRef<'_>, data: &mut NodeData) -> Action {
    if let Some(elm) = data.as_element_mut() {
        if is_preformatted(elm) {
            elm.name.local = t::PRE;
        }
    }
    Action::Continue
}

fn is_block(node: NodeRef<'_>) -> bool {
    if let Some(elm) = node.as_element() {
        if let Some(tmeta) = TAG_META.get(&elm.name.local) {
            return !tmeta.is_inline();
        }
    }
    false
}

// Note this isn't an exact negation of `is_block`: it still returns false for
// unknown elements.
fn is_inline(data: &NodeData) -> bool {
    if let Some(elm) = data.as_element() {
        if let Some(tmeta) = TAG_META.get(&elm.name.local) {
            return tmeta.is_inline();
        }
    }
    false
}

fn is_preformatted(e: &Element) -> bool {
    e.is_elem(t::PRE) || e.is_elem(t::XMP) || e.is_elem(t::PLAINTEXT)
}

fn is_preform_node(n: NodeRef<'_>) -> bool {
    n.is_elem(t::PRE) || n.is_elem(t::XMP) || n.is_elem(t::PLAINTEXT)
}

fn is_logical_ws(n: NodeRef<'_>) -> bool {
    if let Some(t) = n.as_text() {
        is_all_ctrl_ws(t)
    } else {
        n.is_elem(t::BR)
    }
}

fn is_multi_media(n: &NodeData) -> bool {
    /**/n.is_elem(t::AUDIO) ||
        n.is_elem(t::EMBED) ||
        n.is_elem(t::IFRAME) ||
        n.is_elem(t::IMG) ||
        n.is_elem(t::METER) ||
        n.is_elem(t::OBJECT) ||
        n.is_elem(t::PICTURE) ||
        n.is_elem(t::PROGRESS) ||
        n.is_elem(t::SVG) ||
        n.is_elem(t::VIDEO)
}