readability_fork/
extractor.rs1use std::io::Read;
2use std::collections::BTreeMap;
3use std::path::Path;
4use std::cell::Cell;
5use markup5ever_rcdom::{RcDom, SerializableHandle};
6use html5ever::{parse_document, serialize};
7use html5ever::tendril::stream::TendrilSink;
8use std::default::Default;
9#[cfg(feature = "reqwest")]
10use std::time::Duration;
11#[cfg(feature = "reqwest")]
12use reqwest::Client;
13use url::Url;
14use scorer::Candidate;
15use crate::error::Error;
16use crate::dom;
17use crate::scorer;
18
19
/// Result of running the extractor over a single HTML document.
#[derive(Debug)]
pub struct Product {
    /// Page title, as filled in by the scorer's preprocessing pass.
    pub title: String,
    /// Serialized HTML markup of the node selected as the main content.
    pub content: String,
    /// Text collected from that same node by `dom::extract_text`.
    pub text: String,
}
26
/// Downloads `url` and extracts its main content.
///
/// A fresh `Client` with a 30 second timeout is built for the request and
/// the actual work is delegated to `scrape_with_client`.
///
/// # Errors
///
/// Propagates a client construction failure, as well as any error returned
/// by `scrape_with_client`.
#[cfg(feature = "reqwest")]
pub async fn scrape(url: &str) -> Result<Product, Error> {
    let timeout = Duration::from_secs(30);
    let http = Client::builder().timeout(timeout).build()?;
    scrape_with_client(url, &http).await
}
34
/// Downloads `url` with the supplied `client` and extracts its main content.
///
/// # Errors
///
/// * Propagates the error if sending the request or reading the body fails.
/// * Returns `Error::Unexpected` when the response status is not a success.
/// * Propagates the parse error if `url` cannot be parsed into a `Url`.
#[cfg(feature = "reqwest")]
pub async fn scrape_with_client(url: &str, client: &Client) -> Result<Product, Error> {
    let response = client.get(url).send().await?;

    // Anything other than a success status is reported as a generic failure.
    if !response.status().is_success() {
        return Err(Error::Unexpected);
    }

    // The parsed request URL is handed to `extract` together with the body.
    let base = Url::parse(url)?;
    let body = response.text().await?;
    extract(&mut body.as_bytes(), &base)
}
47
48pub fn extract<R>(input: &mut R, url: &Url) -> Result<Product, Error> where R: Read {
49 let mut dom = parse_document(RcDom::default(), Default::default())
50 .from_utf8()
51 .read_from(input)
52 .unwrap();
53 let mut title = String::new();
54 let mut candidates = BTreeMap::new();
55 let mut nodes = BTreeMap::new();
56 let handle = dom.document.clone();
57 scorer::preprocess(&mut dom, handle.clone(), &mut title);
58 scorer::find_candidates(&mut dom, Path::new("/"), handle.clone(), &mut candidates, &mut nodes);
59 let mut id: &str = "/";
60 let mut top_candidate: &Candidate = &Candidate {
61 node: handle.clone(),
62 score: Cell::new(0.0),
63 };
64 for (i, c) in candidates.iter() {
65 let score = c.score.get() * (1.0 - scorer::get_link_density(c.node.clone()));
66 c.score.set(score);
67 if score <= top_candidate.score.get() {
68 continue;
69 }
70 id = i;
71 top_candidate = c;
72 }
73 let mut bytes = vec![];
74
75 let node = &top_candidate.node;
76 scorer::clean(&mut dom, Path::new(id), node.clone(), url, &candidates);
77
78 serialize(&mut bytes, &SerializableHandle::from(node.clone()), Default::default()).ok();
79 let content = String::from_utf8(bytes).unwrap_or_default();
80
81 let mut text: String = String::new();
82 dom::extract_text(node.clone(), &mut text, true);
83 Ok(Product { title: title, content: content, text: text })
84}