// Fetches an HTML page (optional `reqwest` feature) and extracts its main content.
use std::io::Read; use std::collections::BTreeMap; use std::path::Path; use std::cell::Cell; use html5ever::rcdom::{RcDom}; use html5ever::{parse_document, serialize}; use html5ever::tendril::stream::TendrilSink; use std::default::Default; #[cfg(feature = "reqwest")] use std::time::Duration; #[cfg(feature = "reqwest")] use reqwest::Client; use url::Url; use scorer::Candidate; use crate::error::Error; use crate::dom; use crate::scorer; #[derive(Debug)] pub struct Product { pub title: String, pub content: String, pub text: String, } #[cfg(feature = "reqwest")] pub async fn scrape(url: &str) -> Result<Product, Error> { let client = Client::builder() .timeout(Duration::new(30, 0)) .build()?; scrape_with_client(url, &client).await } #[cfg(feature = "reqwest")] pub async fn scrape_with_client(url: &str, client: &Client) -> Result<Product, Error> { let res = client.get(url) .send() .await?; if res.status().is_success() { let url = Url::parse(url)?; extract(&mut res.text().await?.as_bytes(), &url) } else { Err(Error::Unexpected) } } pub fn extract<R>(input: &mut R, url: &Url) -> Result<Product, Error> where R: Read { let mut dom = parse_document(RcDom::default(), Default::default()) .from_utf8() .read_from(input) .unwrap(); let mut title = String::new(); let mut candidates = BTreeMap::new(); let mut nodes = BTreeMap::new(); let handle = dom.document.clone(); scorer::preprocess(&mut dom, handle.clone(), &mut title); scorer::find_candidates(&mut dom, Path::new("/"), handle.clone(), &mut candidates, &mut nodes); let mut id: &str = "/"; let mut top_candidate: &Candidate = &Candidate { node: handle.clone(), score: Cell::new(0.0), }; for (i, c) in candidates.iter() { let score = c.score.get() * (1.0 - scorer::get_link_density(c.node.clone())); c.score.set(score); if score <= top_candidate.score.get() { continue; } id = i; top_candidate = c; } let mut bytes = vec![]; let node = top_candidate.node.clone(); scorer::clean(&mut dom, Path::new(id), node.clone(), url, &candidates); 
serialize(&mut bytes, &node, Default::default()).ok(); let content = String::from_utf8(bytes).unwrap_or_default(); let mut text: String = String::new(); dom::extract_text(node.clone(), &mut text, true); Ok(Product { title: title, content: content, text: text }) }