Crate dyer

Dyer is designed for reliable, flexible and fast web crawling, providing high-level, comprehensive features without compromising speed.

Built upon the event-driven, non-blocking I/O of tokio and the powerful, reliable Rust programming language, and inspired by scrapy, Dyer provides the following high-level features:

  • asynchronous, concurrent streaming and I/O, making the best of thread pools, the network, and system resources.
  • event-driven: once you set the initial Tasks and the recursive generator of new Tasks, Dyer handles the rest.
  • user-friendly: although Rust's philosophy of explicitness can mean more source code and a steeper learning curve, Dyer provides high-level wrappers for convenience.

Walk Through an Example

Take the Scrapy Tutorial as a guide to learn the basics, step by step:

  • Add Dyer as a dependency in your Cargo.toml
  • Write a struct and implement the Spider trait; customize a parser to extract data and generate recursive Tasks
  • Write a PipeLine and MiddleWare to process the data if necessary

Add as Dependency

Dyer is written in Rust; to use it, put the following in your Cargo.toml, together with the other libraries needed later on.

[dependencies]
dyer = "0.1.0"
tokio = { version = "0.2", features = [ "macros", "rt-threaded" ] }
serde = {version = "*", features = ["derive"] }
serde_json = "*"
select = "*"
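Note: with tokio 0.2, the `macros` feature provides the `#[tokio::main]` attribute used below, and `rt-threaded` enables the multi-threaded runtime that the attribute starts by default.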

Customize your code in src/main.rs

extern crate dyer;
extern crate futures;
extern crate select;
extern crate serde;
extern crate serde_json;
extern crate tokio;

use dyer::{
    App, MiddleWare, ParseResult, PipeLine, Profile, Request, ResError, Response, Spider, Task,
};
use futures::future::{BoxFuture, FutureExt};
use serde::{Deserialize, Serialize};
use std::fmt::Debug;
use std::io::{LineWriter, Write};
use std::sync::{Arc, Mutex, Once};

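// a shorthand for the Result type returned by `entry_task`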
type Stem<U> = Result<U, Box<dyn std::error::Error + Send + Sync>>;

// the data to be collected; make sure it is serializable and deserializable
#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct Quote {
    pub text: String,
    pub author: String,
    pub tags: Vec<String>,
}

// use `Items` as a container of all possible Item types.
#[derive(Serialize, Debug, Clone)]
pub enum Items {
    Quote(Quote),
}

// the uri alone is enough to fetch the data, so the generic parameters of `Task` and `Profile`
// are not necessary; leave them empty for the sake of appearance
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Targ {}
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Parg {}

// here `select` is used to extract the data embodied in the HTML
// for more information about how to do that,
// "https://github.com/utkarshkukreti/select.rs" is recommended to explore
pub fn parse_quote(res: Response<Targ, Parg>) -> ParseResult<Items, Targ, Parg> {
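    // a `ParseResult` gathers everything one parse can produce: follow-up
    // `task`s, recycled `profile`s, additional `req`s, extracted `entities`,
    // and parse failures collected in `yield_err`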
    let mut r = ParseResult {
        task: vec![],
        profile: vec![res.profile],
        req: vec![],
        entities: vec![],
        yield_err: vec![],
    };
    if res.content.is_none() {
        // for a `Response` with empty content, just recycle the profile
        return r;
    }
    let mut quotes = Vec::new();
    let doc = select::document::Document::from(res.content.as_ref().unwrap().as_str());
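    // each quote on the page is wrapped in a node with class "quote"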
    for node in doc.find(select::predicate::Class("quote")) {
        let text = node
            .find(select::predicate::Class("text"))
            .next()
            .unwrap()
            .text();
        let author = node
            .find(select::predicate::Class("author"))
            .next()
            .unwrap()
            .text();
        let tags = node
            .find(select::predicate::Class("tag"))
            .map(|tag| tag.text())
            .collect::<Vec<String>>();
        let item = Quote { text, author, tags };
        quotes.push(Items::Quote(item));
    }
    r.entities = quotes;

    // follow the next page if it exists
    if let Some(next_node) = doc.find(select::predicate::Class("next")).next() {
        let next_url = next_node
            .find(select::predicate::Name("a"))
            .next()
            .unwrap()
            .attr("href")
            .unwrap();
        let mut task = Task::<Targ>::default();
        task.uri = format!("https://quotes.toscrape.com{}", next_url);
        r.task.push(task);
    }
    r
}

pub struct Spd {}
// implementing `Spider` for `Spd`
impl Spider<Items, Targ, Parg> for Spd {
    // preparation before opening spider
    fn open_spider(&self, _app: &mut App<Items, Targ, Parg>) {}

    // preparation before closing spider
    fn close_spider(&self, _app: &mut App<Items, Targ, Parg>) {}

    // set up parser that extracts `Quote` from the `Response`
    fn get_parser<'a>(
        &self,
        ind: String,
    ) -> Option<&'a dyn Fn(Response<Targ, Parg>) -> ParseResult<Items, Targ, Parg>> {
        if &ind == "parse_quote" {
            return Some(&parse_quote);
        }
        None
    }

    // `Task` executed when starting `dyer`
    fn entry_task(&self) -> Stem<Vec<Task<Targ>>> {
        let mut task = Task::default();

        // all the information needed is the uri and the parser
        task.uri = "https://quotes.toscrape.com".to_string();
        task.parser = "parse_quote".to_string();
        Ok(vec![task])
    }

    // the generator of `Profile`
    fn entry_profile(
        &self,
    ) -> (
        Request<Targ, Parg>,
        Option<
            &(dyn Fn(&mut Response<Targ, Parg>) -> BoxFuture<'_, Result<Profile<Parg>, ResError>>
                  + Send
                  + Sync),
        >,
    ) {
        let mut req = Request::<Targ, Parg>::default();
        req.task.uri = "https://quotes.toscrape.com".to_string();
        // this site is made for scraping practice, so there is no need to
        // pretend to be a real device
        (req, None)
    }
}

use std::fs::OpenOptions;
// open a file once and keep the handle in a static for reuse
async fn open_file(path: &str) -> &'static Option<std::fs::File> {
    static INIT: Once = Once::new();
    static mut VAL: Option<std::fs::File> = None;
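    // `call_once` guarantees the initializer runs exactly once;
    // concurrent and subsequent calls reuse the already-opened handle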
    unsafe {
        INIT.call_once(|| {
            let file = OpenOptions::new()
                .create(true)
                .write(true)
                .append(true)
                .open(path)
                .unwrap();
            VAL = Some(file);
        });
        &VAL
    }
}
// store Items into file
async fn store_item(items: &mut Arc<Mutex<Vec<Items>>>) {
    // drain the shared vector in order, serializing each item to a line of JSON
    let mut ser_items = Vec::new();
    for item in items.lock().unwrap().drain(..) {
        let s = serde_json::to_string(&item).unwrap();
        ser_items.push(s);
    }
    if ser_items.is_empty() {
        return;
    }
    let mut stream = ser_items.join("\n");
    stream.push('\n');
    let mut file = LineWriter::new(open_file("result.json").await.as_ref().unwrap());
    file.write_all(stream.as_bytes()).unwrap();
}

#[tokio::main]
async fn main() {
    static SPD: Spd = Spd {};
    // since the `Quote` collected by parse_quote is complete, a `MiddleWare` is not necessary
    let middleware = MiddleWare::<Items, Targ, Parg>::builder().build();
    // writing a `PipeLine` to store them
    // for short, handle `Items` only
    let pipeline = PipeLine::<Items, std::fs::File>::builder()
        .process_item(&|items: &mut Arc<Mutex<Vec<Items>>>| store_item(items).boxed_local())
        .build();

    // construct the app and start the crawler
    let mut app = App::<Items, Targ, Parg>::new();
    app.run(&SPD, &middleware, pipeline).await;
}

As you expected, it is done.
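To try it out, run `cargo run` in the project root. The collected quotes are appended to result.json, one JSON object per line; since `Items` is an externally tagged enum, each line looks roughly like the following (the quote shown here is illustrative, the actual content depends on the site):

{"Quote":{"text":"“A day without sunshine is like, you know, night.”","author":"Steve Martin","tags":["humor","obvious","simile"]}}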

Re-exports

pub use component::*;
pub use engine::App;
pub use engine::AppArg;
pub use plugin::MiddleWare;
pub use plugin::PipeLine;
pub use plugin::Spider;

Modules

component
engine
plugin