1use crate::{
2 dom,
3 error::ReadabilityError,
4 scorer::{self, Scorer, ScorerOptions, TopCandidate},
5 utils::{debug_candidate, debug_candidates},
6};
7use html5ever::{parse_document, serialize, tendril::stream::TendrilSink, ParseOpts};
8use log::{debug, trace};
9use markup5ever_rcdom::{Handle, RcDom, SerializableHandle};
10use scorer::Candidate;
11use std::{cell::Cell, collections::BTreeMap, default::Default, io::Read, path::Path};
12use url::Url;
13
/// Final, owned result of [`extract`]: the document title together with the
/// extracted content in both serialized-markup and plain-text form.
///
/// All fields are plain `String`s, so the value can be cloned, compared and
/// default-constructed freely (handy for caching and for assertions in tests).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Readable {
    /// Title collected while the document was being processed.
    pub title: String,
    /// Serialized markup of the node that was selected as the main content.
    pub content: String,
    /// Text extracted from the node that was selected as the main content.
    pub text: String,
}
20
21#[derive(Debug)]
22pub struct Content {
23 pub node: Handle,
24 pub title: String,
25}
26
27#[derive(Debug, Default)]
28pub struct ExtractOptions<'a> {
29 pub parse_options: ParseOptions,
30 pub scorer_options: ScorerOptions<'a>,
31}
32
/// Options for the HTML parsing stage of [`extract`].
///
/// The default is lenient parsing (`strict == false`).
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct ParseOptions {
    /// When `true`, [`extract`] returns a parse error instead of a result if
    /// the HTML parser recorded any errors for the input document.
    pub strict: bool,
}
37
38pub fn extract<R>(
40 input: &mut R,
41 url: &Url,
42 opts: ExtractOptions,
43) -> Result<Readable, ReadabilityError>
44where
45 R: Read,
46{
47 let mut dom = parse_document(RcDom::default(), ParseOpts::default())
48 .from_utf8()
49 .read_from(input)?;
50
51 if opts.parse_options.strict && !dom.errors.is_empty() {
52 return Err(ReadabilityError::ParseHtml(dom.errors));
53 }
54
55 let content = extract_content(&mut dom, url, opts);
56
57 let mut bytes = vec![];
58
59 serialize(
60 &mut bytes,
61 &SerializableHandle::from(content.node.clone()),
62 Default::default(),
63 )?;
64
65 let mut text: String = String::new();
66
67 dom::extract_text(content.node.clone(), &mut text, true);
68
69 let content_string = String::from_utf8(bytes).unwrap_or_default();
70
71 debug!("Extracted title: {}", content.title);
72 trace!("Extracted text: {text}");
73 trace!("Extracted content: {content_string}");
74
75 Ok(Readable {
76 title: content.title,
77 content: content_string,
78 text,
79 })
80}
81
82pub fn extract_content(dom: &mut RcDom, url: &Url, opts: ExtractOptions) -> Content {
84 let mut title = String::new();
85 let mut candidates = BTreeMap::new();
86 let mut nodes = BTreeMap::new();
87 let handle = dom.document.clone();
88 let scorer = Scorer::new(opts.scorer_options);
89
90 scorer.preprocess(dom, handle.clone(), &mut title);
91 scorer.find_candidates(Path::new("/"), handle.clone(), &mut candidates, &mut nodes);
92
93 debug!("Found candidates: {}", candidates.values().len());
94 trace!("Found candidates: {:?}", debug_candidates(&candidates));
95
96 let top_candidate = scorer.find_top_candidate(&candidates).unwrap_or_else(|| {
97 TopCandidate::new(
98 "/",
99 Candidate {
100 node: handle.clone(),
101 score: Cell::new(0.0),
102 },
103 )
104 });
105
106 debug!(
107 "Found top candidate: {:?}",
108 debug_candidate(top_candidate.candidate())
109 );
110
111 scorer.clean(
112 dom,
113 Path::new(top_candidate.id()),
114 top_candidate.node().clone(),
115 url,
116 &candidates,
117 );
118
119 Content {
120 node: top_candidate.node().clone(),
121 title,
122 }
123}