readability_fork/
extractor.rs

1use std::io::Read;
2use std::collections::BTreeMap;
3use std::path::Path;
4use std::cell::Cell;
5use markup5ever_rcdom::{RcDom, SerializableHandle};
6use html5ever::{parse_document, serialize};
7use html5ever::tendril::stream::TendrilSink;
8use std::default::Default;
9#[cfg(feature = "reqwest")]
10use std::time::Duration;
11#[cfg(feature = "reqwest")]
12use reqwest::Client;
13use url::Url;
14use scorer::Candidate;
15use crate::error::Error;
16use crate::dom;
17use crate::scorer;
18
19
/// Result of a readability extraction.
#[derive(Debug)]
pub struct Product {
    /// Page title, as filled in by the scorer's preprocessing pass.
    pub title: String,
    /// Serialized HTML of the node selected as the main content.
    pub content: String,
    /// Text extracted from the same node via `dom::extract_text`.
    pub text: String,
}
26
/// Fetches `url` with a freshly built client and extracts its main content.
///
/// The client is configured with a 30 second request timeout; the actual
/// fetch and extraction are delegated to [`scrape_with_client`].
///
/// # Errors
///
/// Fails if the client cannot be built, or with whatever error
/// [`scrape_with_client`] reports.
#[cfg(feature = "reqwest")]
pub async fn scrape(url: &str) -> Result<Product, Error> {
    let timeout = Duration::from_secs(30);
    let http = Client::builder().timeout(timeout).build()?;
    scrape_with_client(url, &http).await
}
34
/// Fetches `url` using the supplied `client` and extracts its main content.
///
/// # Errors
///
/// * Propagates the request error if sending the request or reading the
///   body fails.
/// * Returns `Error::Unexpected` when the response status is not a success.
/// * Propagates the parse error if `url` is not a valid URL.
/// * Propagates any error returned by [`extract`].
#[cfg(feature = "reqwest")]
pub async fn scrape_with_client(url: &str, client: &Client) -> Result<Product, Error> {
    let response = client.get(url).send().await?;

    // Anything other than a success status is reported as a generic failure.
    if !response.status().is_success() {
        return Err(Error::Unexpected);
    }

    let base = Url::parse(url)?;
    let body = response.text().await?;
    extract(&mut body.as_bytes(), &base)
}
47
48pub fn extract<R>(input: &mut R, url: &Url) -> Result<Product, Error> where R: Read {
49    let mut dom = parse_document(RcDom::default(), Default::default())
50        .from_utf8()
51        .read_from(input)
52        .unwrap();
53    let mut title      = String::new();
54    let mut candidates = BTreeMap::new();
55    let mut nodes      = BTreeMap::new();
56    let handle = dom.document.clone();
57    scorer::preprocess(&mut dom, handle.clone(), &mut title);
58    scorer::find_candidates(&mut dom, Path::new("/"), handle.clone(), &mut candidates, &mut nodes);
59    let mut id: &str = "/";
60    let mut top_candidate: &Candidate = &Candidate {
61        node:  handle.clone(),
62        score: Cell::new(0.0),
63    };
64    for (i, c) in candidates.iter() {
65        let score = c.score.get() * (1.0 - scorer::get_link_density(c.node.clone()));
66        c.score.set(score);
67        if score <= top_candidate.score.get() {
68            continue;
69        }
70        id            = i;
71        top_candidate = c;
72    }
73    let mut bytes = vec![];
74
75    let node = &top_candidate.node;
76    scorer::clean(&mut dom, Path::new(id), node.clone(), url, &candidates);
77
78    serialize(&mut bytes, &SerializableHandle::from(node.clone()), Default::default()).ok();
79    let content = String::from_utf8(bytes).unwrap_or_default();
80
81    let mut text: String = String::new();
82    dom::extract_text(node.clone(), &mut text, true);
83    Ok(Product { title: title, content: content, text: text })
84}