1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
use std::io::Read;
use std::collections::BTreeMap;
use std::path::Path;
use std::cell::Cell;
use html5ever::rcdom::{RcDom};
use html5ever::{parse_document, serialize};
use html5ever::tendril::stream::TendrilSink;
use std::default::Default;
#[cfg(feature = "reqwest")]
use std::time::Duration;
#[cfg(feature = "reqwest")]
use reqwest::Client;
use url::Url;
use scorer::Candidate;
use crate::error::Error;
use crate::dom;
use crate::scorer;


/// Result of running the extractor over an HTML document.
///
/// All three fields are produced by [`extract`].
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct Product {
    /// Title string filled in during `scorer::preprocess`.
    pub title: String,
    /// Serialized markup of the node selected as the top candidate, after cleaning.
    pub content: String,
    /// Text collected from the selected node by `dom::extract_text`.
    pub text: String,
}

/// Downloads `url` and extracts its readable content.
///
/// Builds a one-off [`Client`] configured with a 30-second timeout and hands
/// the request over to [`scrape_with_client`].
///
/// # Errors
///
/// Fails if the client cannot be built, or with whatever error
/// [`scrape_with_client`] reports.
#[cfg(feature = "reqwest")]
pub async fn scrape(url: &str) -> Result<Product, Error> {
    let timeout = Duration::from_secs(30);
    let client = Client::builder().timeout(timeout).build()?;
    scrape_with_client(url, &client).await
}

/// Downloads `url` with the supplied `client` and extracts its readable content.
///
/// The response body is read as text and passed to [`extract`] together with
/// the parsed form of `url`.
///
/// # Errors
///
/// * Propagates failures from sending the request or reading the body.
/// * Returns [`Error::Unexpected`] when the response status is not a success.
/// * Propagates the parse error when `url` is not a valid URL (checked only
///   after a successful response has been received).
#[cfg(feature = "reqwest")]
pub async fn scrape_with_client(url: &str, client: &Client) -> Result<Product, Error> {
    let response = client.get(url).send().await?;
    if !response.status().is_success() {
        return Err(Error::Unexpected);
    }

    let base = Url::parse(url)?;
    let body = response.text().await?;
    // `&[u8]` implements `Read`, so the in-memory body can be fed to `extract`.
    extract(&mut body.as_bytes(), &base)
}

pub fn extract<R>(input: &mut R, url: &Url) -> Result<Product, Error> where R: Read {
    let mut dom = parse_document(RcDom::default(), Default::default())
        .from_utf8()
        .read_from(input)
        .unwrap();
    let mut title      = String::new();
    let mut candidates = BTreeMap::new();
    let mut nodes      = BTreeMap::new();
    let handle = dom.document.clone();
    scorer::preprocess(&mut dom, handle.clone(), &mut title);
    scorer::find_candidates(&mut dom, Path::new("/"), handle.clone(), &mut candidates, &mut nodes);
    let mut id: &str = "/";
    let mut top_candidate: &Candidate = &Candidate {
        node:  handle.clone(),
        score: Cell::new(0.0),
    };
    for (i, c) in candidates.iter() {
        let score = c.score.get() * (1.0 - scorer::get_link_density(c.node.clone()));
        c.score.set(score);
        if score <= top_candidate.score.get() {
            continue;
        }
        id            = i;
        top_candidate = c;
    }
    let mut bytes = vec![];

    let node = top_candidate.node.clone();
    scorer::clean(&mut dom, Path::new(id), node.clone(), url, &candidates);

    serialize(&mut bytes, &node, Default::default()).ok();
    let content = String::from_utf8(bytes).unwrap_or_default();

    let mut text: String = String::new();
    dom::extract_text(node.clone(), &mut text, true);
    Ok(Product { title: title, content: content, text: text })
}