readable-rs 0.1.2

A native Rust port of Mozilla's Readability algorithm for extracting readable content from HTML pages
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
mod cleaner;
mod scorer;
mod utils;

use crate::logging::logger::*;
use crate::logging::logging_defs::*;
use crate::models::ExtractOptions;
use crate::node_ext::NodeScoreStore;
use crate::parser::{NodeExt, NodeRef, new_html_element, parse_html};
use crate::utils::*;
use std::collections::HashSet;
use std::sync::LazyLock;
use utils::*;

/// Tag names that keep their own element type when a sibling of the top
/// candidate is appended to the article container; an appended sibling with
/// any other tag name is cloned and renamed to `div` instead.
pub static ALTER_TO_DIV_EXCEPTIONS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    ["div", "article", "section", "p", "ol", "ul"]
        .iter()
        .copied()
        .collect()
});

/// `id` attribute value stamped onto the element that wraps the extracted page content.
const DEFAULT_CONTENT_ID: &str = "readability-page-1";
/// `class` attribute value stamped onto that same wrapper element.
const DEFAULT_CONTENT_CLASS: &str = "page";

/// The result of the article-content extraction phase.
///
/// Returned by [`get_content`] when one of the extraction attempts yields
/// usable content.
pub struct ContentData {
    /// The extracted content subtree: a container `<div>` whose content is
    /// held by the element tagged `id="readability-page-1"` / `class="page"`.
    pub content: NodeRef,
    /// The byline / author string: the caller-supplied initial byline, which
    /// the cleaning pass is handed mutably and may update.  Empty when there
    /// is none.
    pub by_line: String,
    /// The value of the first `dir` attribute found on the top candidate, its
    /// parent, or the parent's ancestors (typically `"ltr"` / `"rtl"`).
    /// Empty when no `dir` was found.
    pub article_dir: String,
}

/// Internal driver for the content-extraction loop; bundles the inputs that
/// `parse` needs across its retry attempts.
struct ContentParser<'a> {
    /// Root node of the document being processed.
    document: NodeRef,
    /// Performance logger handed to the cleaning, scoring and prep helpers.
    logger: &'a PerfLogger,
    /// Extraction options; `parse` switches individual flags off between retries.
    options: ExtractOptions,
    /// Copy of `options.debug` taken when the parser is constructed.
    debug: bool,
    /// Article title, forwarded to the cleaning pass.
    article_title: String,
    /// Byline known before extraction starts; used as the starting byline value.
    initial_by_line: Option<String>,
}

impl<'a> ContentParser<'a> {
    /// Builds a parser over `document`.
    ///
    /// `options.debug` is copied into its own field so it can be checked
    /// without going through `options`.
    fn new(
        document: NodeRef,
        article_title: String,
        options: ExtractOptions,
        initial_by_line: Option<String>,
        logger: &PerfLogger,
    ) -> ContentParser<'_> {
        ContentParser {
            // Read the flag before `options` is moved into the struct below.
            debug: options.debug,
            document,
            logger,
            options,
            article_title,
            initial_by_line,
        }
    }

    /// For every tag in `tags_to_remove`, hands the matching nodes under
    /// `article_content` to `remove_nodes`, using `is_safe_to_remove_node`
    /// as the removal predicate.
    ///
    /// Does nothing when `options.clean_conditionally` is switched off.
    fn remove_nodes_conditionally(
        &self,
        article_content: &NodeRef,
        tags_to_remove: &[&str],
        store: &NodeScoreStore,
    ) {
        if !self.options.clean_conditionally {
            return;
        }

        let modifier = self.options.link_density_modifier;
        for tag_name in tags_to_remove {
            // The predicate is told whether it is looking at a list tag.
            let is_list = matches!(*tag_name, "ul" | "ol");
            remove_nodes(article_content, tag_name, |node, tag| {
                is_safe_to_remove_node(node, tag, is_list, modifier, self.logger, store)
            });
        }
    }

    /// Cleans up `article_content` in place once the candidate subtree has
    /// been assembled.
    ///
    /// The passes run in a fixed order because earlier removals influence
    /// what the later, conditional passes decide:
    ///
    /// 1. presentation styles, data-table marking, lazy-image fixing;
    /// 2. forms, non-video embeds, gallery widgets, footers/links/asides,
    ///    small "share" elements, form controls and negatively weighted
    ///    `h1`/`h2` headers;
    /// 3. conditional cleaning of tables, lists and divs, then ad/loading
    ///    placeholders;
    /// 4. `h1` -> `h2` renaming, empty paragraphs, `<br>` before `<p>`, and
    ///    single-cell tables;
    /// 5. a final sweep over elements that ended up empty.
    ///
    /// * `article_content` - root of the subtree to clean.
    /// * `logger` - performance logger forwarded to the text helpers.
    /// * `store` - node score store, updated by the data-table marking and
    ///   read by the conditional cleaning.
    fn prep_article(
        &self,
        article_content: &NodeRef,
        logger: &PerfLogger,
        store: &mut NodeScoreStore,
    ) {
        clean_presentation_styles(article_content);

        // Check for data tables before we continue, to avoid removing items in
        // those tables, which will often be isolated even though they're
        // visually linked to other content-ful elements (text, images, etc.).
        mark_data_tables_in_node(article_content, store);

        fix_lazy_images(article_content);

        // Clean out junk from the article content
        self.remove_nodes_conditionally(article_content, &["form", "fieldset"], store);

        for t in &["object", "embed", "iframe"] {
            remove_nodes(article_content, t, |node, tag| {
                !is_possibly_useful_video_node(node, tag)
            });
        }

        // Remove known gallery widgets that shouldn't appear in reader view.
        remove_nodes(article_content, "div", |node, _| {
            node.attr_value("data-scald-gallery").is_some()
        });

        for t in &["footer", "link", "aside"] {
            remove_nodes(article_content, t, |_, _| true);
        }

        // Clean out elements with little content that have "share" in their
        // id/class combinations. Only the subtrees below the direct children
        // are inspected, so the final top candidates themselves are not
        // removed even if they carry "share".
        for n in article_content.children() {
            remove_matched_nodes(&n, |node, match_str| {
                let is_share_element = SHARE_ELEMENTS_REGEX.is_match(match_str);
                let smaller_than_threshold =
                    node.text_contents().chars().count() < self.options.char_threshold as usize;
                is_share_element && smaller_than_threshold
            });
        }

        for t in &["input", "textarea", "select", "button"] {
            remove_nodes(article_content, t, |_, _| true);
        }

        for t in &["h1", "h2"] {
            remove_nodes(article_content, t, |node, _| {
                get_class_and_id_weight(node) < 0
            });
        }

        // Do these last as the previous stuff may have removed junk
        // that will affect these
        self.remove_nodes_conditionally(article_content, &["table", "ul"], store);
        self.remove_nodes_conditionally(article_content, &["div"], store);
        for t in &["div", "section"] {
            remove_nodes(article_content, t, |node, _| {
                let inner = get_normalized_text_content(node, logger);
                matches_ad_or_loading(inner.as_str())
            });
        }
        rename_tags_with_selector(article_content, "h1", "h2");

        // Remove extra paragraphs: no media descendants and no text at all.
        remove_nodes(article_content, "p", |node, _| {
            let img = select_descendants(node, "img").len();
            let embed = select_descendants(node, "embed").len();
            let object = select_descendants(node, "object").len();
            // At this point, nasty iframes have been removed, only remain embedded video ones.
            let iframe = select_descendants(node, "iframe").len();
            let total_count = img + embed + object + iframe;
            total_count == 0 && node.text_contents().is_empty()
        });

        // Drop a <br> whose next element is a paragraph.
        apply(article_content, &["br"], |br, _| {
            let followed_by_paragraph = next_element(Some(br.clone()))
                .is_some_and(|next| next.element_name() == Some("p"));
            if followed_by_paragraph {
                br.detach();
            }
        });

        // Replace a table that consists of a single cell (optionally wrapped
        // in a single tbody) with that cell, renamed to <p> when it only
        // holds phrasing content and to <div> otherwise.
        apply(article_content, &["table"], |table, _| {
            let tbody = if contains_single_tag_in_element(table, "tbody") {
                table.first_element_child().unwrap()
            } else {
                table.clone()
            };

            if contains_single_tag_in_element(&tbody, "tr") {
                let row = tbody.first_element_child().unwrap();
                if contains_single_tag_in_element(&row, "td") {
                    let cell = row.first_element_child().unwrap();
                    let new_tag_name = if test_all_siblings(cell.first_child(), is_phrasing_content)
                    {
                        "p"
                    } else {
                        "div"
                    };

                    let cell = cell.clone_and_rename_element(new_tag_name);
                    table.insert_after(cell);
                    table.detach();
                }
            }
        });

        // For all elements, remove any element that is considered empty; this
        // can be the result of a previous clean-up step.
        //
        // The matches are collected *before* anything is detached: detaching
        // (or replacing) a node while the lazy `descendants()` walk is still
        // positioned on it can cut the walk short, which would leave every
        // later empty node in place.
        // NOTE(review): the exact failure mode depends on the traversal
        // implementation behind `NodeRef` — confirm against the parser module.
        let empty_nodes: Vec<NodeRef> = article_content
            .descendants()
            .filter(|n| is_empty_node(n, logger))
            .collect();

        for n in empty_nodes {
            if let Some(id) = n.attr_value("id") {
                // The node carries an id that something may still point at.
                if let Some(name) = n.element_name() {
                    // 1- an "a" is left to the links resolution to deal with
                    // 2- a span is a lightweight element, let it be
                    // 3- anything else is replaced with an empty span that
                    //    holds the same id
                    if name != "a" && name != "span" {
                        let span = new_html_element("span");
                        let e = span.as_element().unwrap();
                        e.attributes.borrow_mut().insert("id", id);
                        n.insert_after(span);
                        n.detach();
                    }
                }
            } else {
                // Nothing else cares about this node: drop it from the tree.
                n.detach();
            }
        }
    }

    /// Picks the node that is handed to the scoring pass as the page body:
    /// the first `body` match in the document, or the document node itself
    /// when there is no such match.
    ///
    /// In debug mode a missing body is recorded as a `missing_body` point on
    /// the `PARSE_CONTENT` span.
    fn get_content_node(&self) -> NodeRef {
        match self.document.select_first("body") {
            Ok(body) => body.as_node().clone(),
            Err(_) => {
                if self.debug {
                    let logger = self.logger;
                    add_point_to_span_str!(logger, PARSE_CONTENT, "missing_body");
                }
                self.document.clone()
            }
        }
    }

    /// Makes sure the article content ends up inside an element carrying the
    /// default page id and class.
    ///
    /// * When the top candidate was synthesised by the scoring pass
    ///   (`had_to_create_top_candidate_node`), it already wraps everything,
    ///   so it is tagged in place.
    /// * Otherwise a fresh `<div>` is tagged, receives all current children
    ///   of `article_content`, and becomes its only child.
    fn update_or_create_top_candidate_node(
        &self,
        article_content: &NodeRef,
        top_candidate: &NodeRef,
        had_to_create_top_candidate_node: bool,
        _logger: &PerfLogger,
    ) {
        // Stamps the default page id and class onto `node`.
        let tag_as_page = |node: &NodeRef| {
            let element = node.as_element().unwrap();
            let mut attrs = element.attributes.borrow_mut();
            attrs.insert("id", String::from(DEFAULT_CONTENT_ID));
            attrs.insert("class", String::from(DEFAULT_CONTENT_CLASS));
        };

        if had_to_create_top_candidate_node {
            // The fake wrapper already exists and there were no siblings left
            // to collect, so creating another div and moving the children
            // over would be pointless: tagging the candidate is enough.
            tag_as_page(top_candidate);
            return;
        }

        let wrapper = new_html_element("div");
        tag_as_page(&wrapper);
        move_children(article_content, &wrapper);
        article_content.append(wrapper);
    }

    /// Runs the extraction loop and returns the article content, if any.
    ///
    /// Each attempt cleans the document, scores the remaining elements,
    /// gathers the top candidate together with its related siblings, and
    /// post-processes the result with `prep_article`.  When the normalized
    /// text of an attempt is shorter than `options.char_threshold`, one more
    /// flag is switched off (`strip_unlikelys`, then `weight_classes`, then
    /// `clean_conditionally`) and the attempt is repeated on a fresh parse of
    /// the original markup.  Once every flag is off, the longest attempt seen
    /// so far is used.
    ///
    /// Returns `None` when even the best attempt contains no text.
    ///
    /// Note that `self.options` is modified by the retries.
    pub fn parse(&mut self, store: &mut NodeScoreStore) -> Option<ContentData> {
        let logger = &self.logger;
        start_span!(logger, PARSE_CONTENT);
        let mut body_content = self.get_content_node();
        add_point_to_span_str!(
            logger,
            PARSE_CONTENT,
            "get_content_node_and_its_serialized_html_begin"
        );

        // The serialized markup is captured up front, before any attempt
        // mutates the tree, so that retries can start from pristine input.
        let (mut doc_node, content_cache) = (self.document.clone(), self.document.to_string());
        let mut by_line: Option<String> = self.initial_by_line.clone();
        let mut attempts: Vec<(NodeRef, usize)> = vec![];
        loop {
            let elements_to_score = cleaner::clean(
                &doc_node,
                &mut by_line,
                self.options.strip_unlikelys,
                self.article_title.as_str(),
                logger,
            );
            let scoring_res = scorer::score_elements(
                &elements_to_score,
                store,
                self.options.weight_classes,
                self.options.n_top_candidates as usize,
                &body_content,
                logger,
            );
            // Now that we have the top candidate, look through its siblings for content
            // that might also be related. Things like preambles, content split by ads
            // that we removed, etc.
            let mut article_content = new_html_element("div");
            let top_candidate = scoring_res.top_candidate;
            // Keep potential top candidate's parent node to try to get text direction of it later.
            let parent_of_top_candidate = top_candidate.parent();

            if let Some(p) = parent_of_top_candidate.clone() {
                for sibling in p.element_children() {
                    let append = if sibling == top_candidate {
                        true
                    } else {
                        should_append_sibling(&sibling, &top_candidate, logger, store)
                    };

                    if append {
                        if let Some(sibling_name) = sibling.element_name() {
                            let sibling_name = sibling_name.to_lowercase();
                            if !ALTER_TO_DIV_EXCEPTIONS.contains(sibling_name.as_str()) {
                                article_content.append(sibling.clone_and_rename_element("div"));
                            } else {
                                article_content.append(sibling);
                            }
                        } else {
                            article_content.append(sibling);
                        }
                    }
                }
            }

            self.prep_article(&article_content, logger, store);

            self.update_or_create_top_candidate_node(
                &article_content,
                &top_candidate,
                scoring_res.had_to_create_top_candidate_node,
                logger,
            );

            // Now that we've gone through the full algorithm, check to see if
            // we got any meaningful content. If we didn't, we may need to re-run
            // this method with different flags set. This gives us a higher likelihood of
            // finding the content, and the sieve approach gives us a higher likelihood of
            // finding the -right- content.
            let normalized_text = get_normalized_text_content(&article_content, logger);
            let text_length = normalized_text.chars().count();
            if text_length < self.options.char_threshold as usize {
                attempts.push((article_content.clone(), text_length));

                // Relax exactly one flag per failed attempt.
                let retry = if self.options.strip_unlikelys {
                    self.options.strip_unlikelys = false;
                    true
                } else if self.options.weight_classes {
                    self.options.weight_classes = false;
                    true
                } else if self.options.clean_conditionally {
                    self.options.clean_conditionally = false;
                    true
                } else {
                    false
                };

                if retry {
                    // Start over from the pristine markup. The body handed to
                    // the scorer has to come from the *new* tree; reusing the
                    // node taken from the previous, already mutated tree would
                    // make the scorer work on a different document than the
                    // cleaner.
                    doc_node = parse_html(content_cache.as_str());
                    body_content = match doc_node.select_first("body") {
                        Ok(body) => body.as_node().clone(),
                        Err(_) => doc_node.clone(),
                    };
                    continue;
                }

                // No luck with all flags off: fall back to the longest text
                // found during the different loops. The sort is stable, so on
                // a tie the earliest attempt wins. `doc_node` is deliberately
                // left untouched here so the tree the top candidate lives in
                // is still the current one during the `dir` lookup below.
                attempts.sort_by(|lhs, rhs| rhs.1.cmp(&lhs.1));
                let best = attempts.first().map(|(node, len)| (node.clone(), *len));
                match best {
                    Some((node, len)) if len > 0 => article_content = node,
                    // Every attempt normalized to nothing; keep the current
                    // content only if its raw text is not blank.
                    Some(_) if !article_content.text_contents().trim().is_empty() => {}
                    _ => {
                        d!({
                            eprintln!(
                                "Failed to get readable article content with all flags turned off: {}",
                                self.article_title
                            )
                        });
                        return None;
                    }
                }
            }

            // Text direction: first `dir` attribute on the top candidate, its
            // parent, or the parent's ancestors (elements only).
            let mut ancestors = vec![top_candidate];
            if let Some(p) = parent_of_top_candidate {
                ancestors.push(p.clone());
                ancestors.append(&mut get_node_ancestors(&p, DEFAULT_MAX_ANCESTORS_DEPTH));
            }

            let mut article_dir: Option<String> = None;
            for n in ancestors {
                if let (Some(_), Some(dir)) = (n.element_name(), n.attr_value("dir")) {
                    article_dir = Some(dir);
                    break;
                }
            }

            return Some(ContentData {
                content: article_content,
                by_line: by_line.unwrap_or_default(),
                article_dir: article_dir.unwrap_or_default(),
            });
        }
    }
}

/// Extracts the main article content from `document`.
///
/// This is the entry point of the content-extraction phase: it builds an
/// internal parser from the arguments and runs its retry loop once.
///
/// Yields `Some(ContentData)` when an attempt produced usable content, and
/// `None` when every retry strategy (switching off `strip_unlikelys`,
/// `weight_classes` and `clean_conditionally`, in that order) has been used
/// up without finding any text.
pub fn get_content(
    document: NodeRef,
    options: ExtractOptions,
    article_title: String,
    initial_by_line: Option<String>,
    store: &mut NodeScoreStore,
    logger: &PerfLogger,
) -> Option<ContentData> {
    ContentParser::new(document, article_title, options, initial_by_line, logger).parse(store)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::logging::logger::PerfLogger;
    use std::panic::{AssertUnwindSafe, catch_unwind};

    /// Logger built from an empty list, as needed by the parser constructor.
    fn new_logger() -> PerfLogger {
        PerfLogger::new(vec![])
    }

    /// Runs `parse` over a bare `<div>` (a document without `<body>`) with the
    /// given debug flag and reports whether the call completed without panicking.
    fn parse_without_body_completes(debug: bool) -> bool {
        let logger = new_logger();
        let mut options = ExtractOptions::default();
        options.debug = debug;

        let mut parser = ContentParser::new(
            new_html_element("div"),
            "title".to_string(),
            options,
            None,
            &logger,
        );
        let mut store = NodeScoreStore::default();

        catch_unwind(AssertUnwindSafe(|| parser.parse(&mut store))).is_ok()
    }

    #[test]
    fn parse_no_body_does_not_panic_with_debug_false() {
        assert!(parse_without_body_completes(false));
    }

    #[test]
    fn parse_no_body_does_not_panic_with_debug_true() {
        assert!(parse_without_body_completes(true));
    }
}