Crate dyer[][src]

dyer is designed for reliable, flexible and fast web-crawling, providing some high-level, comprehensive features without compromising speed.

Built on tokio's event-driven, non-blocking I/O and on the powerful, reliable Rust programming language, and inspired by scrapy, dyer provides some high-level features:

  • asynchronous, concurrent streaming and I/O, making the best of the thread pool, network, and system resources.
  • event-driven: once you set up the initial Tasks and the recursive generator of Task, dyer will handle the rest.
  • user-friendly: given the philosophy of the Rust programming language, writing more source code and designing a proper architecture yourself may leave you in a dilemma once efficiency and learning cost are taken into consideration. dyer offers high-level, flexible wrappers and APIs that do a lot for you.

Walk you through an example

Take the Scrapy Tutorial as a guide to learn the basics, step by step.

  • Add dyer as a dependency in your Cargo.toml
  • Write a struct and implement the Spider trait, customizing a parser to extract data and generate recursive Tasks
  • Write a PipeLine and a MiddleWare to process data if necessary

Add as Dependency

dyer is written in Rust. Normally, put the following in Cargo.toml, along with the other libraries needed later on.

dyer = "0.1.1"
tokio = { version = "0.2", features = [ "macros", "rt-threaded" ] }
futures = "*"
serde = {version = "*", features = ["derive"] }
serde_json = "*"
select = "*"
log = "*"
simple_logger = "*"

Customize your code in src/main.rs

extern crate dyer;
extern crate futures;
extern crate select;
extern crate serde;
extern crate serde_json;
extern crate tokio;
extern crate log;
extern crate simple_logger;

use dyer::{
    App, MiddleWare, ParseResult, PipeLine, Profile, Request, ResError, Response, Spider, Task,
};
use futures::future::{BoxFuture, FutureExt};
use serde::{Deserialize, Serialize};
use std::fmt::Debug;
use std::fs::OpenOptions;
use std::io::{LineWriter, Write};
use std::sync::{Arc, Mutex, Once};
use simple_logger::SimpleLogger;

// shorthand for a `Result` whose error side is any boxed error that is `Send + Sync`
type Stem<U> = Result<U, Box<dyn std::error::Error + Send + Sync>>;

// the data to be collected; make sure it is deserializable and serializable
// (both `Deserialize` and `Serialize` are derived here)
#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct Quote {
    // text of the element with class `text` inside a quote node
    pub text: String,
    // text of the element with class `author` inside a quote node
    pub author: String,
    // text of every element with class `tag` inside a quote node
    pub tags: Vec<String>,
}

// use `Items` as a container of all possible items;
// this example collects only one kind of item, so there is a single variant
#[derive(Serialize, Debug, Clone)]
pub enum Items {
    // a scraped `Quote`
    Quote(Quote),
}

// the uri alone is enough to get the data, so the generic parameters of `Task` and `Profile`
// are not necessary; leave them empty for the sake of appearance
// `Targ` is the empty type used as the generic argument of `Task`
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Targ {}
// `Parg` is the empty type used as the generic argument of `Profile`
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Parg {}

// here `select` is implemented to extract the data embodied in the HTML
// for more infomation about how to do that,
// "https://github.com/utkarshkukreti/select.rs" is recommanded to explore
pub fn parse_quote(res: Response<Targ, Parg>) -> ParseResult<Items, Targ, Parg>
{
    let mut r = ParseResult {
        task: vec![],
        profile: vec![res.profile],
        req: vec![],
        entities: vec![],
        yield_err: vec![],
    };
    if res.content.is_none() {
        // for the `Response` with empty content, recycle profile
        return r;
    }
    let mut quotes = Vec::new();
    let doc = select::document::Document::from(res.content.as_ref().unwrap().as_str());
    for node in doc.find(select::predicate::Class("quote")) {
        let text = node
            .find(select::predicate::Class("text"))
            .next()
            .unwrap()
            .text();
        let author = node
            .find(select::predicate::Class("author"))
            .next()
            .unwrap()
            .text();
        let tags = node
            .find(select::predicate::Class("tag"))
            .map(|tag| tag.text())
            .collect::<Vec<String>>();
        let item = Quote { text, author, tags };
        quotes.push(Items::Quote(item));
    }
    r.entities = quotes;

    // follow the next page if exists
    let mut next_node = doc.find(select::predicate::Class("next"));
    if let Some(nd) = next_node.next() {
        // next page exists
        let next_url = nd
            .find(select::predicate::Name("a"))
            .next()
            .unwrap()
            .attr("href")
            .unwrap();
        let mut task = Task::<Targ>::default();
        task.uri = format!("https://quotes.toscrape.com{}", next_url);
        task.parser = "parse_quote".to_string();
        r.task.push(task);
    }
    r
}

// the spider of this example; it carries no state of its own
pub struct Spd {}
// `Spider` implementation for `Spd`
impl Spider<Items, Targ, Parg> for Spd {
    // hook called with the `App` before the spider is opened
    // this example has nothing to prepare
    fn open_spider(&self, _app: &mut App<Items, Targ, Parg>) {}

    // hook called with the `App` before the spider is closed
    // this example has nothing to clean up
    fn close_spider(&self, _app: &mut App<Items, Targ, Parg>) {}

    // map a parser name to the parser that extracts `Quote` from the `Response`
    // only "parse_quote" is known, any other name yields `None`
    fn get_parser<'a>(
        &self,
        ind: String,
    ) -> Option<&'a dyn Fn(Response<Targ, Parg>) -> ParseResult<Items, Targ, Parg>> {
        match ind.as_str() {
            "parse_quote" => Some(&parse_quote),
            _ => None,
        }
    }

    // the `Task`s to be executed when starting `dyer`
    fn entry_task(&self) -> Stem<Vec<Task<Targ>>> {
        // a uri and a parser name are all the information needed
        let mut first = Task::<Targ>::default();
        first.uri = String::from("https://quotes.toscrape.com");
        first.parser = String::from("parse_quote");

        Ok(vec![first])
    }

    // the generator of `Profile`
    // `dyer` consumes the returned `Request` and generates a `Response`, which is fed to the
    // closure to generate a `Profile`
    fn entry_profile(
        &self,
    ) -> (
        Request<Targ, Parg>,
        Option<
            &(dyn Fn(&mut Response<Targ, Parg>) -> BoxFuture<'_, Result<Profile<Parg>, ResError>>
                  + Send
                  + Sync),
        >,
    ) {
        // as the domain suggests, this site is specially built for crawling,
        // so there is no need to pretend to be a real device:
        // no `Profile` generator is supplied
        let mut request = Request::<Targ, Parg>::default();
        request.task.uri = String::from("https://quotes.toscrape.com");
        (request, None)
    }
}

// open the output file once and hand out a `'static` reference to it
// `path` is only used by the first call (the caller passes `result.json`); every later call
// returns the handle opened back then
async fn open_file(path: &str) -> &'static Option<std::fs::File> {
    static mut VAL: Option<std::fs::File> = None;
    static INIT: Once = Once::new();

    INIT.call_once(|| {
        let mut options = OpenOptions::new();
        options.create(true).write(true).append(true);
        let handle = options.open(path).unwrap();
        // SAFETY: `call_once` runs this closure at most once and makes concurrent callers wait
        // until it has finished, so nothing else touches `VAL` while it is being written
        unsafe {
            VAL = Some(handle);
        }
    });
    // SAFETY: the only write to `VAL` happens inside `call_once` above, which has completed
    // by the time this line is reached; from here on `VAL` is only read
    unsafe { &VAL }
}
// store Items into file
//
// Drains every pending item out of `items`, serializes each one to a single line of JSON and
// appends those lines to `result.json`.
// Panics if an item cannot be serialized or the file cannot be opened or written.
async fn store_item(items: &mut Arc<Mutex<Vec<Items>>>) {
    // take everything under one lock acquisition instead of locking once per item; the guard
    // is released at the end of this statement, i.e. before the `.await` below
    // `rev()` keeps the last-in-first-out order in which the items have always been written
    let pending: Vec<Items> = items.lock().unwrap().drain(..).rev().collect();

    let mut stream = String::new();
    for item in &pending {
        stream.push_str(&serde_json::to_string(item).unwrap());
        // terminate every record, the last one included: the file is opened in append mode,
        // so without it the next batch would continue on the same line
        stream.push('\n');
    }

    let file = open_file("result.json").await.as_ref().unwrap();
    let mut writer = LineWriter::new(file);
    // `write_all` keeps going until the whole buffer is written; a bare `write` is allowed
    // to stop after writing only a part of it
    writer.write_all(stream.as_bytes()).unwrap();
}

#[tokio::main]
async fn main() {
    // for the sake of clarification, use simple_logger to display some level-varied infomation
    // initialize simple_logger
    SimpleLogger::new()
    .with_level(log::LevelFilter::Info)
    .with_module_level("dyer", log::LevelFilter::Debug)
    .init()
    .unwrap();
    static SPD: Spd = Spd {};
    // since the `Quote` collected by parse_quote is complete, `MiddleWare` is not necessary,
    let middleware = MiddleWare::<Items, Targ, Parg>::builder().build();
    // writing a `PipeLine` to store them
    // for short, handle `Items` only
    let pipeline = PipeLine::<Items, std::fs::File>::builder()
        .process_item(&|items: &mut Arc<Mutex<Vec<Items>>>| store_item(items).boxed_local())
        .build();

    // construct the app and start the crawler
    let mut app: App<Items, Targ, Parg> = App::<Items, Targ, Parg>::new();
    // donot use history, such as `Request`, `Profile`, `Task`
    app.rt_args.lock().unwrap().skip_history = true;
    app.run(&SPD, &middleware, pipeline).await;
    Ok(())
}

As you would expect, it is done.

Re-exports

pub use component::*;
pub use engine::App;
pub use engine::AppArg;
pub use plugin::MiddleWare;
pub use plugin::PipeLine;
pub use plugin::Spider;

Modules

component
engine
plugin