article_extractor/full_text_parser/readability/
mod.rs

1mod state;
2
3#[cfg(test)]
4mod tests;
5
6use self::state::State;
7use super::error::FullTextParserError;
8use crate::{constants, util::Util};
9use libxml::tree::{Document, Node};
10use std::cmp::Ordering;
11
/// Rust port of the Mozilla Readability algorithm.
///
/// Used as fallback for `ArticleScraper` if no fitting config can be found.
///
/// The type is a stateless unit struct; all functionality is exposed through
/// associated functions such as [`Readability::extract`].
pub struct Readability;
16
17impl Readability {
18    /// Parse HTML and extract meaningful content
19    ///
20    /// # Arguments
21    ///
22    /// * `html` - HTML of a website containing an article or similar content
23    /// * `base_url` - URL used to complete relative URLs
24    ///
25    pub fn extract(html: &str, base_url: Option<url::Url>) -> Result<String, FullTextParserError> {
26        libxml::tree::node::set_node_rc_guard(10);
27        let empty_config = crate::full_text_parser::config::ConfigEntry::default();
28
29        let url =
30            base_url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap());
31        let document = crate::FullTextParser::parse_html(html, None, &empty_config)?;
32        let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document)?;
33
34        crate::FullTextParser::prep_content(&xpath_ctx, None, &empty_config, &url, &document, None);
35        let mut article = crate::article::Article {
36            title: None,
37            author: None,
38            url,
39            date: None,
40            thumbnail_url: None,
41            html: None,
42        };
43
44        let mut article_document = Document::new().map_err(|()| FullTextParserError::Xml)?;
45        let mut root =
46            Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
47        article_document.set_root_element(&root);
48
49        crate::full_text_parser::metadata::extract(&xpath_ctx, None, None, &mut article);
50        super::Readability::extract_body(document, &mut root, article.title.as_deref())?;
51        crate::FullTextParser::post_process_document(&article_document)?;
52
53        let html = Util::serialize_node(&article_document, &root);
54        article.html.replace(html.clone());
55
56        Ok(html)
57    }
58
59    pub(crate) fn extract_body(
60        document: Document,
61        root: &mut Node,
62        title: Option<&str>,
63    ) -> Result<bool, FullTextParserError> {
64        let mut state = State::default();
65        let mut document = document;
66        let mut attempts: Vec<(Node, usize, Document)> = Vec::new();
67        let document_cache = document
68            .dup()
69            .map_err(|()| FullTextParserError::Readability)?;
70
71        loop {
72            let mut elements_to_score = Vec::new();
73            let mut node: Option<Node> = document.clone().get_root_element();
74
75            while let Some(node_ref) = node.as_mut() {
76                let tag_name = node_ref.get_name().to_uppercase();
77
78                if tag_name == "TEXT" && node_ref.get_content().trim().is_empty() {
79                    node = Util::next_node(node_ref, true);
80                    continue;
81                }
82
83                let match_string = Util::get_signature(node_ref);
84
85                if !Util::is_probably_visible(node_ref) {
86                    log::debug!("removing hidden node {match_string}");
87                    node = Util::remove_and_next(node_ref);
88                    continue;
89                }
90
91                if Self::check_byline(node_ref, &match_string, &mut state) {
92                    node = Util::remove_and_next(node_ref);
93                    continue;
94                }
95
96                if state.should_remove_title_header
97                    && Util::header_duplicates_title(node_ref, title)
98                {
99                    state.should_remove_title_header = false;
100                    node = Util::remove_and_next(node_ref);
101                    continue;
102                }
103
104                // Remove unlikely candidates
105                if state.strip_unlikely {
106                    if constants::UNLIELY_CANDIDATES.is_match(&match_string)
107                        && !constants::OKAY_MAYBE_ITS_A_CANDIDATE.is_match(&match_string)
108                        && !Util::has_ancestor_tag(
109                            node_ref,
110                            "table",
111                            None,
112                            None::<fn(&Node) -> bool>,
113                        )
114                        && !Util::has_ancestor_tag(
115                            node_ref,
116                            "code",
117                            None,
118                            None::<fn(&Node) -> bool>,
119                        )
120                        && tag_name != "BODY"
121                        && tag_name != "A"
122                    {
123                        node = Util::remove_and_next(node_ref);
124                        continue;
125                    }
126
127                    if let Some(role) = node_ref.get_attribute("role") {
128                        if constants::UNLIKELY_ROLES.contains(&role.as_str()) {
129                            node = Util::remove_and_next(node_ref);
130                            continue;
131                        }
132                    }
133                }
134
135                // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
136                if (tag_name == "DIV"
137                    || tag_name == "SECTION"
138                    || tag_name == "HEADER"
139                    || tag_name == "H1"
140                    || tag_name == "H2"
141                    || tag_name == "H3"
142                    || tag_name == "H4"
143                    || tag_name == "H5"
144                    || tag_name == "H6")
145                    && Util::is_element_without_content(node_ref)
146                {
147                    node = Util::remove_and_next(node_ref);
148                    continue;
149                }
150
151                if constants::DEFAULT_TAGS_TO_SCORE.contains(&tag_name.as_str()) {
152                    elements_to_score.push(node_ref.clone());
153                }
154
155                // Turn all divs that don't have children block level elements into p's
156                if tag_name == "DIV" {
157                    // Put phrasing content into paragraphs.
158                    let mut p: Option<Node> = None;
159                    for mut child in node_ref.get_child_nodes().into_iter() {
160                        if child.is_null() {
161                            continue;
162                        }
163
164                        if Util::is_phrasing_content(&child) {
165                            if let Some(p) = p.as_mut() {
166                                child.unlink();
167                                p.add_child(&mut child).map_err(|error| {
168                                    log::error!("{error}");
169                                    FullTextParserError::Readability
170                                })?;
171                            } else if !Util::is_whitespace(&child) {
172                                let mut new_node = Node::new("p", None, &document)
173                                    .map_err(|()| FullTextParserError::Readability)?;
174                                let mut old_node = node_ref
175                                    .replace_child_node(new_node.clone(), child)
176                                    .map_err(|error| {
177                                        log::error!("{error}");
178                                        FullTextParserError::Readability
179                                    })?;
180
181                                new_node.add_child(&mut old_node).map_err(|error| {
182                                    log::error!("{error}");
183                                    FullTextParserError::Readability
184                                })?;
185                                p.replace(new_node);
186                            }
187                        } else if p.is_some() {
188                            if let Some(p) = p.as_mut() {
189                                for mut r_node in p.get_child_nodes().into_iter().rev() {
190                                    if r_node.is_null() {
191                                        continue;
192                                    }
193
194                                    if Util::is_whitespace(&r_node) {
195                                        r_node.unlink();
196                                        continue;
197                                    }
198                                    break;
199                                }
200                            }
201                            _ = p.take();
202                        }
203                    }
204
205                    // Sites like http://mobile.slate.com encloses each paragraph with a DIV
206                    // element. DIVs with only a P element inside and no text content can be
207                    // safely converted into plain P elements to avoid confusing the scoring
208                    // algorithm with DIVs with are, in practice, paragraphs.
209                    if Util::has_single_tag_inside_element(node_ref, "P")
210                        && Util::get_link_density(node_ref) < 0.25
211                    {
212                        if let Some(new_node) = node_ref.get_first_element_child() {
213                            if let Some(mut parent) = node_ref.get_parent() {
214                                parent
215                                    .replace_child_node(new_node.clone(), node_ref.clone())
216                                    .map_err(|error| {
217                                        log::error!("{error}");
218                                        FullTextParserError::Readability
219                                    })?;
220                                node = Util::next_node(&new_node, false);
221                                elements_to_score.push(new_node.clone());
222                                continue;
223                            }
224                        }
225                    } else if !Util::has_child_block_element(node_ref)
226                        && node_ref.set_name("P").is_ok()
227                    {
228                        elements_to_score.push(node_ref.clone());
229                    }
230                }
231
232                node = Util::next_node(node_ref, false);
233            }
234
235            let mut candidates = Vec::new();
236            // Loop through all paragraphs, and assign a score to them based on how content-y they look.
237            // Then add their score to their parent node.
238            // A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
239            for element_to_score in elements_to_score.drain(..) {
240                if element_to_score.get_parent().is_none() {
241                    continue;
242                }
243
244                let inner_text = Util::get_inner_text(&element_to_score, true);
245                let inner_text_len = inner_text.len();
246
247                // If this paragraph is less than 25 characters, don't even count it.
248                if inner_text_len < 25 {
249                    continue;
250                }
251
252                // Exclude nodes with no ancestor.
253                let ancestors = Util::get_node_ancestors(&element_to_score, Some(5));
254                if ancestors.is_empty() {
255                    continue;
256                }
257
258                let mut content_score = 0.0;
259
260                // Add a point for the paragraph itself as a base.
261                content_score += 1.0;
262
263                // Add points for any commas within this paragraph.
264                content_score += inner_text.split(',').count() as f64;
265
266                // For every 100 characters in this paragraph, add another point. Up to 3 points.
267                content_score += f64::min(f64::floor(inner_text.len() as f64 / 100.0), 3.0);
268
269                // Initialize and score ancestors.
270                for (level, mut ancestor) in ancestors.into_iter().enumerate() {
271                    let tag_name = ancestor.get_name().to_uppercase();
272
273                    if ancestor.get_parent().is_none() || tag_name == "HTML" {
274                        continue;
275                    }
276
277                    if Self::get_content_score(&ancestor).is_none() {
278                        Self::initialize_node(&mut ancestor, &state)?;
279                        candidates.push(ancestor.clone());
280                    }
281
282                    // Node score divider:
283                    // - parent:             1 (no division)
284                    // - grandparent:        2
285                    // - great grandparent+: ancestor level * 3
286                    let score_divider = if level == 0 {
287                        1.0
288                    } else if level == 1 {
289                        2.0
290                    } else {
291                        level as f64 * 3.0
292                    };
293
294                    if let Some(score) = Self::get_content_score(&ancestor) {
295                        let add_score = content_score / score_divider;
296                        let new_score = score + add_score;
297                        log::debug!(
298                            "{}: {score} + {add_score} = {new_score}",
299                            ancestor.get_name()
300                        );
301                        Self::set_content_score(&mut ancestor, new_score)?;
302                    }
303                }
304            }
305
306            //Util::serialize_document(&document, "dbg.html");
307
308            // After we've calculated scores, loop through all of the possible
309            // candidate nodes we found and find the one with the highest score.
310            for candidate in candidates.iter_mut() {
311                // Scale the final candidates score based on link density. Good content
312                // should have a relatively small link density (5% or less) and be mostly
313                // unaffected by this operation.
314                if let Some(content_score) = Self::get_content_score(candidate) {
315                    let candidate_score = content_score * (1.0 - Util::get_link_density(candidate));
316                    Self::set_content_score(candidate, candidate_score)?;
317                }
318            }
319
320            candidates.sort_by(|a, b| {
321                if let (Some(a), Some(b)) = (Self::get_content_score(a), Self::get_content_score(b))
322                {
323                    b.partial_cmp(&a).unwrap_or(Ordering::Equal)
324                } else {
325                    Ordering::Equal
326                }
327            });
328
329            let top_candidates = candidates.into_iter().take(5).collect::<Vec<_>>();
330
331            for candidate in top_candidates.iter() {
332                log::debug!(
333                    "candidate: {} {:?}",
334                    candidate.get_name(),
335                    candidate.get_attributes()
336                );
337            }
338            let mut needed_to_create_top_candidate = false;
339            let mut top_candidate = if let Some(t) = top_candidates.first() {
340                t.clone()
341            } else {
342                // If we still have no top candidate, just use the body as a last resort.
343                // We also have to copy the body node so it is something we can modify.
344                let Some(mut root) = document.get_root_element() else {
345                    log::error!("document has no root element");
346                    return Err(FullTextParserError::Xml);
347                };
348
349                if let Some(body) = root
350                    .get_child_elements()
351                    .into_iter()
352                    .find(|n| n.get_name().to_uppercase() == "BODY")
353                {
354                    root = body;
355                }
356
357                let mut new_top_candidate =
358                    Node::new("DIV", None, &document).expect("can't create new node");
359
360                for mut child in root.get_child_elements().drain(..) {
361                    if child.is_null() {
362                        continue;
363                    }
364
365                    child.unlink();
366                    new_top_candidate.add_child(&mut child).unwrap();
367                }
368
369                root.add_child(&mut new_top_candidate).unwrap();
370
371                Self::initialize_node(&mut new_top_candidate, &state)
372                    .expect("init should not fail");
373                needed_to_create_top_candidate = true;
374                new_top_candidate
375            };
376
377            // Util::serialize_node(&top_candidate, "top_candidate.html");
378
379            let mut alternative_candidate_ancestors = Vec::new();
380            // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
381            // and whose scores are quite closed with current `topCandidate` node.
382            if let Some(top_score) = Self::get_content_score(&top_candidate) {
383                for candidate in top_candidates.iter().skip(1) {
384                    let score = Self::get_content_score(candidate).unwrap_or(0.0);
385                    if score / top_score >= 0.75 {
386                        alternative_candidate_ancestors
387                            .push(Util::get_node_ancestors(candidate, None));
388                    }
389                }
390            }
391
392            if alternative_candidate_ancestors.len() >= constants::MINIMUM_TOPCANDIDATES {
393                let mut parent_of_top_candidate = top_candidate.get_parent();
394
395                while let Some(parent) = &parent_of_top_candidate {
396                    if parent.get_name().to_uppercase() == "BODY" {
397                        break;
398                    }
399
400                    let mut lists_containing_this_ancestor = 0;
401                    let tmp = usize::min(
402                        alternative_candidate_ancestors.len(),
403                        constants::MINIMUM_TOPCANDIDATES,
404                    );
405                    for ancestors in alternative_candidate_ancestors.iter().take(tmp) {
406                        lists_containing_this_ancestor +=
407                            ancestors.iter().filter(|n| n == &parent).count();
408                    }
409
410                    if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
411                        top_candidate = parent.clone();
412                        break;
413                    }
414
415                    parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
416                }
417            }
418
419            if Self::get_content_score(&top_candidate).is_none() {
420                Self::initialize_node(&mut top_candidate, &state)?;
421            }
422
423            //Util::serialize_node(&top_candidate, "new_top_candidate.html");
424
425            // Because of our bonus system, parents of candidates might have scores
426            // themselves. They get half of the node. There won't be nodes with higher
427            // scores than our topCandidate, but if we see the score going *up* in the first
428            // few steps up the tree, that's a decent sign that there might be more content
429            // lurking in other places that we want to unify in. The sibling stuff
430            // below does some of that - but only if we've looked high enough up the DOM
431            // tree.
432            let mut parent_of_top_candidate = top_candidate.get_parent();
433            let mut last_score = Self::get_content_score(&top_candidate).unwrap_or(0.0);
434
435            // The scores shouldn't get too low.
436            let score_threshold = last_score / 3.0;
437
438            while parent_of_top_candidate.is_some()
439                && !Util::has_tag_name(parent_of_top_candidate.as_ref(), "BODY")
440            {
441                if parent_of_top_candidate
442                    .as_ref()
443                    .map(|n| Self::get_content_score(n).is_none())
444                    .unwrap_or(false)
445                {
446                    parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
447                    continue;
448                }
449
450                let parent_score = parent_of_top_candidate
451                    .as_ref()
452                    .and_then(Self::get_content_score)
453                    .unwrap_or(0.0);
454                if parent_score < score_threshold {
455                    break;
456                }
457
458                if parent_score > last_score {
459                    // Alright! We found a better parent to use.
460                    if let Some(parent) = parent_of_top_candidate {
461                        top_candidate = parent;
462                    }
463                    break;
464                }
465
466                last_score = parent_of_top_candidate
467                    .as_ref()
468                    .and_then(Self::get_content_score)
469                    .unwrap_or(0.0);
470                parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
471            }
472
473            // If the top candidate is the only child, use parent instead. This will help sibling
474            // joining logic when adjacent content is actually located in parent's sibling node.
475            parent_of_top_candidate = top_candidate.get_parent();
476
477            while !Util::has_tag_name(parent_of_top_candidate.as_ref(), "BODY")
478                && parent_of_top_candidate
479                    .as_ref()
480                    .map(|n| n.get_child_elements().len() == 1)
481                    .unwrap_or(false)
482            {
483                top_candidate = parent_of_top_candidate.ok_or(FullTextParserError::Readability)?;
484                parent_of_top_candidate = top_candidate.get_parent();
485            }
486
487            if Self::get_content_score(&top_candidate).is_none() {
488                Self::initialize_node(&mut top_candidate, &state)?;
489            }
490
491            // Now that we have the top candidate, look through its siblings for content
492            // that might also be related. Things like preambles, content split by ads
493            // that we removed, etc.
494            let mut article_content =
495                Node::new("DIV", None, &document).map_err(|()| FullTextParserError::Readability)?;
496
497            let sibling_score_threshold = f64::max(
498                10.0,
499                Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2,
500            );
501            // Keep potential top candidate's parent node to try to get text direction of it later.
502            parent_of_top_candidate = top_candidate.get_parent();
503            let siblings = parent_of_top_candidate
504                .as_ref()
505                .map(|n| n.get_child_elements());
506
507            if let Some(mut siblings) = siblings {
508                for mut sibling in siblings.drain(..) {
509                    if sibling.is_null() {
510                        continue;
511                    }
512
513                    let mut append = false;
514
515                    let score = Self::get_content_score(&sibling).unwrap_or(0.0);
516                    log::debug!(
517                        "Looking at sibling node: {} ({:?}) with score {score}",
518                        sibling.get_name(),
519                        sibling.get_attribute("class")
520                    );
521
522                    if top_candidate == sibling {
523                        append = true;
524                    } else {
525                        let mut content_bonus = 0.0;
526
527                        // Give a bonus if sibling nodes and top candidates have the example same classname
528                        let sibling_classes = sibling.get_class_names();
529                        let tc_classes = top_candidate.get_class_names();
530
531                        if !tc_classes.is_empty()
532                            && !sibling_classes.is_empty()
533                            && sibling_classes
534                                .iter()
535                                .all(|class| tc_classes.contains(class))
536                        {
537                            content_bonus +=
538                                Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2;
539                        }
540
541                        if score + content_bonus >= sibling_score_threshold {
542                            append = true;
543                        } else if sibling.get_name().to_uppercase() == "P" {
544                            let link_density = Util::get_link_density(&sibling);
545                            let node_content = Util::get_inner_text(&sibling, true);
546                            let node_length = node_content.len();
547
548                            if node_length > 80
549                                && (link_density < 0.25
550                                    || (node_length > 0
551                                        && link_density == 0.0
552                                        && constants::SIBLING_CONTENT.is_match(&node_content)))
553                            {
554                                append = true;
555                            }
556                        }
557                    }
558
559                    if append {
560                        log::debug!(
561                            "Appending node: {} ({:?})",
562                            sibling.get_name(),
563                            sibling.get_attribute("class")
564                        );
565
566                        if !constants::ALTER_TO_DIV_EXCEPTIONS
567                            .contains(sibling.get_name().to_uppercase().as_str())
568                        {
569                            // We have a node that isn't a common block level element, like a form or td tag.
570                            // Turn it into a div so it doesn't get filtered out later by accident.
571                            log::debug!(
572                                "Altering sibling: {} ({:?})",
573                                sibling.get_name(),
574                                sibling.get_attribute("class")
575                            );
576
577                            sibling.set_name("DIV").map_err(|error| {
578                                log::error!("{error}");
579                                FullTextParserError::Readability
580                            })?;
581                        }
582
583                        sibling.unlink();
584                        article_content.add_child(&mut sibling).map_err(|error| {
585                            log::error!("{error}");
586                            FullTextParserError::Readability
587                        })?;
588                    }
589                }
590            }
591
592            if state.clean_conditionally {
593                crate::FullTextParser::post_process_page(&mut article_content)?;
594            }
595
596            if needed_to_create_top_candidate {
597                // We already created a fake div thing, and there wouldn't have been any siblings left
598                // for the previous loop, so there's no point trying to create a new div, and then
599                // move all the children over. Just assign IDs and class names here. No need to append
600                // because that already happened anyway.
601                top_candidate
602                    .set_property("id", "readability-page-1")
603                    .map_err(|error| {
604                        log::error!("{error}");
605                        FullTextParserError::Readability
606                    })?;
607            } else {
608                let mut div = Node::new("DIV", None, &document)
609                    .map_err(|()| FullTextParserError::Readability)?;
610                div.set_property("id", "readability-page-1")
611                    .map_err(|error| {
612                        log::error!("{error}");
613                        FullTextParserError::Readability
614                    })?;
615
616                for mut child in article_content.get_child_nodes() {
617                    if child.is_null() {
618                        continue;
619                    }
620
621                    child.unlink();
622                    div.add_child(&mut child).map_err(|error| {
623                        log::error!("{error}");
624                        FullTextParserError::Readability
625                    })?;
626                }
627                article_content.add_child(&mut div).map_err(|error| {
628                    log::error!("{error}");
629                    FullTextParserError::Readability
630                })?;
631            }
632
633            let mut parse_successful = true;
634
635            // Now that we've gone through the full algorithm, check to see if
636            // we got any meaningful content. If we didn't, we may need to re-run
637            // grabArticle with different flags set. This gives us a higher likelihood of
638            // finding the content, and the sieve approach gives us a higher likelihood of
639            // finding the -right- content.
640            let text = Util::get_inner_text(&article_content, true);
641            let text_length = text.len();
642
643            if text_length < constants::DEFAULT_CHAR_THRESHOLD {
644                parse_successful = false;
645
646                if state.strip_unlikely {
647                    state.strip_unlikely = false;
648                    attempts.push((article_content, text_length, document));
649                } else if state.weigh_classes {
650                    state.weigh_classes = false;
651                    attempts.push((article_content, text_length, document));
652                } else if state.clean_conditionally {
653                    state.clean_conditionally = false;
654                    attempts.push((article_content, text_length, document));
655                } else {
656                    attempts.push((article_content, text_length, document));
657                    // No luck after removing flags, just return the longest text we found during the different loops
658
659                    attempts.sort_by(|(_, size_a, _), (_, size_b, _)| size_a.cmp(size_b));
660
661                    // But first check if we actually have something
662                    if let Some((best_attempt, _len, _document)) = attempts.pop() {
663                        for mut child in best_attempt.get_child_nodes() {
664                            if child.is_null() {
665                                continue;
666                            }
667
668                            child.unlink();
669                            root.add_child(&mut child).map_err(|error| {
670                                log::error!("{error}");
671                                FullTextParserError::Readability
672                            })?;
673                        }
674                        parse_successful = true;
675                    }
676
677                    return Ok(parse_successful);
678                }
679
680                document = document_cache
681                    .dup()
682                    .map_err(|()| FullTextParserError::Readability)?;
683            } else {
684                for mut child in article_content.get_child_nodes() {
685                    if child.is_null() {
686                        continue;
687                    }
688
689                    child.unlink();
690                    root.add_child(&mut child).map_err(|error| {
691                        log::error!("{error}");
692                        FullTextParserError::Readability
693                    })?;
694                }
695                return Ok(parse_successful);
696            }
697        }
698    }
699
700    fn get_content_score(node: &Node) -> Option<f64> {
701        node.get_attribute(constants::SCORE_ATTR)
702            .and_then(|a| a.parse::<f64>().ok())
703    }
704
705    fn set_content_score(node: &mut Node, score: f64) -> Result<(), FullTextParserError> {
706        node.set_attribute(constants::SCORE_ATTR, &score.to_string())
707            .map_err(|err| {
708                log::error!("failed to set content score: {err}");
709                FullTextParserError::Readability
710            })
711    }
712
713    fn check_byline(node: &Node, matchstring: &str, state: &mut State) -> bool {
714        if state.byline.is_some() {
715            return false;
716        }
717
718        let rel = node
719            .get_attribute("rel")
720            .map(|rel| rel == "author")
721            .unwrap_or(false);
722        let itemprop = node
723            .get_attribute("itemprop")
724            .map(|prop| prop.contains("author"))
725            .unwrap_or(false);
726
727        let content = node.get_content();
728        if rel
729            || itemprop
730            || constants::BYLINE.is_match(matchstring) && Self::is_valid_byline(&content)
731        {
732            state.byline = Some(content.trim().into());
733            true
734        } else {
735            false
736        }
737    }
738
739    // Check whether the input string could be a byline.
740    // This verifies that the input length is less than 100 chars.
741    fn is_valid_byline(line: &str) -> bool {
742        let len = line.trim().len();
743        len > 0 && len < 100
744    }
745
746    // Initialize a node with the readability object. Also checks the
747    // className/id for special names to add to its score.
748    fn initialize_node(node: &mut Node, state: &State) -> Result<(), FullTextParserError> {
749        let score = match node.get_name().to_uppercase().as_str() {
750            "DIV" => 5,
751            "PRE" | "TD" | "BLOCKQUITE" => 3,
752            "ADDRESS" | "OL" | "UL" | "DL" | "DD" | "DT" | "LI" | "FORM" => -3,
753            "H1" | "H2" | "H3" | "H4" | "H5" | "H6" | "TH" => -5,
754            _ => 0,
755        };
756        let class_weight = if state.weigh_classes {
757            Util::get_class_weight(node)
758        } else {
759            0
760        };
761        let score = score + class_weight;
762        log::debug!(
763            "initialize node {} {}: {score}",
764            node.get_name(),
765            node.get_attribute("class").unwrap_or_default()
766        );
767        Self::set_content_score(node, score as f64)?;
768        Ok(())
769    }
770}