//! coma 0.2.1
//!
//! Coma is a lightweight command-line tool designed for crawling websites.
use std::{
    process,
    sync::{Arc, Mutex},
};

use crate::scrapy::Browser;
use cli::Commands;
use colored::Colorize;
use scrapy::ScrapyError;
use tokio::{sync::Semaphore, task::JoinSet};

mod cli;
mod config;
mod graph;
mod scrapy;
mod state;
mod topology;

use config::Config;
use state::State;
use topology::Node;

/// Process-wide semaphore that navigation tasks take a permit from before they start.
///
/// It is created with zero permits because the real capacity is only known at runtime:
/// `run` adds `config.thread` permits before any layer is browsed.
static PERMITS: Semaphore = Semaphore::const_new(0);

/// Drives a complete crawl from configuration to (optional) graph rendering.
///
/// The crawl proceeds layer by layer: each iteration pops a layer from `state`, browses
/// it, collects the results, and then moves one depth further. It stops when no layer is
/// left or when the current depth equals `config.target_depth`. When the command is
/// `Commands::Graph`, the tree rooted at `config.root` is rendered afterwards.
///
/// # Errors
///
/// Propagates any error from building the configuration or the state, from browsing or
/// collecting a layer, and from rendering the graph.
///
/// # Panics
///
/// Panics if the root node's mutex is poisoned.
async fn run() -> Result<(), Box<dyn std::error::Error>> {
    let config = Config::new()?;
    let mut state = State::new(Arc::clone(&config.root))?;
    println!("{:?}", config.root.lock().unwrap().url);

    // The semaphore is declared empty; give it its real capacity now that it is known.
    PERMITS.add_permits(config.thread as usize);

    loop {
        if state.pop_layer().is_none() {
            break;
        }
        println!("=== Depth {} ===", state.current_depth);

        let mut pending = browse_layer(&mut state, &config).await?;
        collect(&mut state, &config, &mut pending).await?;

        // The target depth has just been processed: do not descend any further.
        let reached_target = state.current_depth == config.target_depth;
        if reached_target {
            break;
        }
        state.current_depth += 1;
        println!();
    }

    let wants_graph = config.cmd == Commands::Graph;
    if wants_graph {
        graph::render(&config.root).await?;
    }
    Ok(())
}

async fn browse_layer(
    state: &mut State,
    config: &Config,
) -> Result<JoinSet<(Result<Browser, ScrapyError>, Arc<Mutex<Node>>)>, Box<dyn std::error::Error>> {
    let mut handles = JoinSet::new();
    while let Some(node) = state.current_layer.pop() {
        if !config.same_domain(&node.lock().unwrap().url)
            || state.known(&node)
            || !config.in_bound(&node.lock().unwrap().url)
        {
            continue;
        }

        let permit = PERMITS.acquire().await?;
        println!("Visiting {}", node.lock().unwrap().url.as_str().green());
        handles.spawn(async move {
            let _permit = permit;
            let url = node.lock().unwrap().url.clone();
            (Browser::new_navigate(&url), node)
        });
    }
    Ok(handles)
}

/// Waits for every navigation task of the layer and turns the discovered links into the
/// next layer.
///
/// For each finished task the page is parsed for the configured command, the parent node
/// is marked as explored, and one child node is created per retained link. A link is
/// retained when it belongs to the configured domain, or while
/// `state.current_external` is still below `config.target_external`. The children are
/// handed to `state.add_to_next_layer`, and a summary line with the total count is printed
/// once all tasks have been joined.
///
/// # Errors
///
/// Returns the first join error or navigation error encountered; remaining tasks are not
/// processed in that case.
///
/// # Panics
///
/// Panics if a node's mutex is poisoned.
async fn collect(
    state: &mut State,
    config: &Config,
    handles: &mut JoinSet<(Result<Browser, ScrapyError>, Arc<Mutex<Node>>)>,
) -> Result<(), Box<dyn std::error::Error>> {
    println!("Collecting data from every url of the layer");
    let mut total_count = 0;
    while let Some(handle) = handles.join_next().await {
        let (browser, parent) = handle?;

        // NOTE(review): this function never advances `state.current_external`; confirm it
        // is updated elsewhere, otherwise the external-link budget is never consumed.
        let links = browser?
            .parse_document(config.cmd, &parent)
            .await
            .into_iter()
            .filter(|link| {
                config.same_domain(link) || state.current_external < config.target_external
            });

        // The iterator above is lazy: the parent is flagged as explored before any child
        // node is built from it.
        parent.lock().unwrap().explore();

        let childs: Vec<Arc<Mutex<Node>>> = links
            .map(|url| Node::new_arc(Some(&parent), url.clone(), url.to_string()))
            .collect();
        total_count += parent.lock().unwrap().quantity_elements() + childs.len();
        state.add_to_next_layer(childs);
    }
    println!(
        "Found a total of {} {:?}",
        total_count.to_string().green(),
        config.cmd
    );
    Ok(())
}

/// Program entry point: builds the tokio runtime and blocks on `run`.
///
/// Exit codes:
/// * `1` — the crawl returned an error (printed with its `Debug` representation);
/// * `2` — the tokio runtime could not be created (the underlying cause is printed).
fn main() {
    let rt = match tokio::runtime::Runtime::new() {
        Ok(rt) => rt,
        Err(err) => {
            // Report why the runtime failed instead of discarding the I/O error.
            eprintln!("Error: can't start the tokio runtime: {}", err);
            process::exit(2);
        }
    };

    if let Err(e) = rt.block_on(run()) {
        eprintln!("{:?}", e);
        process::exit(1);
    }
}