//! spider_cli 1.7.14
//!
//! Multithreaded web crawler written in Rust.
extern crate spider;
extern crate env_logger;

pub mod options;

use clap::Parser;
use options::{Cli, Commands};
use spider::website::Website;
use std::io::{self, Write};

/// CLI entry point: parse arguments, configure the crawler, then run the
/// requested subcommand against the target domain.
fn main() {
    let cli = Cli::parse();

    // Initialize logging only when `--verbose` is given; default level `info`.
    if cli.verbose {
        use env_logger::Env;
        let env = Env::default()
            .filter_or("RUST_LOG", "info")
            .write_style_or("RUST_LOG_STYLE", "always");

        env_logger::init_from_env(env);
    }

    let mut website: Website = Website::new(&cli.domain);

    website.configuration.respect_robots_txt = cli.respect_robots_txt;

    // Override configuration defaults only when the flag was actually given.
    // (The previous code read the default out and wrote it straight back,
    // allocating eagerly via `unwrap_or` even when the flag was present.)
    if let Some(delay) = cli.delay {
        website.configuration.delay = delay;
    }
    if let Some(concurrency) = cli.concurrency {
        website.configuration.concurrency = concurrency;
    }

    // Blacklist is a comma-separated list, e.g. `--blacklist-url /login,/logout`.
    // Empty segments (from stray commas) are dropped rather than stored.
    if let Some(blacklist) = cli.blacklist_url {
        website.configuration.blacklist_url.extend(
            blacklist
                .split(',')
                .filter(|url| !url.is_empty())
                .map(String::from),
        );
    }

    // The configuration stores a `&'static str`, so a process-lifetime string
    // is obtained by leaking the owned value. Unlike before, we leak only when
    // the user actually supplied a non-empty agent (no redundant clone, and no
    // leaked copy of the built-in default).
    if let Some(user_agent) = cli.user_agent.filter(|ua| !ua.is_empty()) {
        website.configuration.user_agent = Box::leak(user_agent.into_boxed_str());
    }

    match &cli.command {
        Some(Commands::CRAWL { sync, output_links }) => {
            if *sync {
                website.crawl_sync();
            } else {
                website.crawl();
            }

            if *output_links {
                let links: Vec<_> = website.get_links().iter().collect();
                io::stdout()
                    .write_all(format!("{:?}", links).as_bytes())
                    .expect("failed to write crawled links to stdout");
            }
        }
        None => {}
    }
}