readability/
extractor.rs

1use crate::{
2    dom,
3    error::ReadabilityError,
4    scorer::{self, Scorer, ScorerOptions, TopCandidate},
5    utils::{debug_candidate, debug_candidates},
6};
7use html5ever::{parse_document, serialize, tendril::stream::TendrilSink, ParseOpts};
8use log::{debug, trace};
9use markup5ever_rcdom::{Handle, RcDom, SerializableHandle};
10use scorer::Candidate;
11use std::{cell::Cell, collections::BTreeMap, default::Default, io::Read, path::Path};
12use url::Url;
13
/// Final result of [`extract`]: the readable parts of a document.
#[derive(Debug)]
pub struct Readable {
    /// Title collected while the document was processed.
    pub title: String,
    /// The selected content node serialized back to HTML.
    pub content: String,
    /// Text pulled out of the selected content node.
    pub text: String,
}
20
21#[derive(Debug)]
22pub struct Content {
23    pub node: Handle,
24    pub title: String,
25}
26
27#[derive(Debug, Default)]
28pub struct ExtractOptions<'a> {
29    pub parse_options: ParseOptions,
30    pub scorer_options: ScorerOptions<'a>,
31}
32
/// Controls how parser-reported errors are treated by [`extract`].
#[derive(Default, Debug)]
pub struct ParseOptions {
    /// When `true`, [`extract`] fails with `ReadabilityError::ParseHtml` if the
    /// HTML parser recorded any error; when `false` (the default) such errors
    /// are ignored.
    pub strict: bool,
}
37
38/// Extract content from an HTML reader.
39pub fn extract<R>(
40    input: &mut R,
41    url: &Url,
42    opts: ExtractOptions,
43) -> Result<Readable, ReadabilityError>
44where
45    R: Read,
46{
47    let mut dom = parse_document(RcDom::default(), ParseOpts::default())
48        .from_utf8()
49        .read_from(input)?;
50
51    if opts.parse_options.strict && !dom.errors.is_empty() {
52        return Err(ReadabilityError::ParseHtml(dom.errors));
53    }
54
55    let content = extract_content(&mut dom, url, opts);
56
57    let mut bytes = vec![];
58
59    serialize(
60        &mut bytes,
61        &SerializableHandle::from(content.node.clone()),
62        Default::default(),
63    )?;
64
65    let mut text: String = String::new();
66
67    dom::extract_text(content.node.clone(), &mut text, true);
68
69    let content_string = String::from_utf8(bytes).unwrap_or_default();
70
71    debug!("Extracted title: {}", content.title);
72    trace!("Extracted text: {text}");
73    trace!("Extracted content: {content_string}");
74
75    Ok(Readable {
76        title: content.title,
77        content: content_string,
78        text,
79    })
80}
81
82/// Extract content `Node` from DOM.
83pub fn extract_content(dom: &mut RcDom, url: &Url, opts: ExtractOptions) -> Content {
84    let mut title = String::new();
85    let mut candidates = BTreeMap::new();
86    let mut nodes = BTreeMap::new();
87    let handle = dom.document.clone();
88    let scorer = Scorer::new(opts.scorer_options);
89
90    scorer.preprocess(dom, handle.clone(), &mut title);
91    scorer.find_candidates(Path::new("/"), handle.clone(), &mut candidates, &mut nodes);
92
93    debug!("Found candidates: {}", candidates.values().len());
94    trace!("Found candidates: {:?}", debug_candidates(&candidates));
95
96    let top_candidate = scorer.find_top_candidate(&candidates).unwrap_or_else(|| {
97        TopCandidate::new(
98            "/",
99            Candidate {
100                node: handle.clone(),
101                score: Cell::new(0.0),
102            },
103        )
104    });
105
106    debug!(
107        "Found top candidate: {:?}",
108        debug_candidate(top_candidate.candidate())
109    );
110
111    scorer.clean(
112        dom,
113        Path::new(top_candidate.id()),
114        top_candidate.node().clone(),
115        url,
116        &candidates,
117    );
118
119    Content {
120        node: top_candidate.node().clone(),
121        title,
122    }
123}