omnivore-cli 0.1.0

Command-line interface for the Omnivore web crawler.
use anyhow::Result;
use clap::{CommandFactory, Parser, Subcommand};
use colored::*;
use indicatif::{ProgressBar, ProgressStyle};
use omnivore_core::{
    crawler::Crawler,
    CrawlConfig, PolitenessConfig,
};
use std::path::PathBuf;
use tokio;
use url::Url;

// Top-level command-line arguments for the `omnivore` binary.
//
// Plain `//` comments are used here on purpose: clap's derive macros read
// `///` doc comments as help text, and every help string below is set
// explicitly through attributes instead.
#[derive(Parser)]
#[command(name = "omnivore")]
#[command(author, version, about = "Universal Rust Web Crawler & Knowledge Graph Builder", long_about = None)]
struct Cli {
    // The subcommand selected by the user.
    #[command(subcommand)]
    command: Commands,

    // Global flag: `main` uses the "debug" log filter instead of "info" when set.
    #[arg(short, long, global = true, help = "Enable debug-level log output")]
    verbose: bool,

    // Global option holding the path of a configuration file.
    #[arg(short, long, value_name = "FILE", global = true, help = "Path to a configuration file")]
    config: Option<PathBuf>,
}

// Subcommands of the `omnivore` binary.
//
// Each variant carries an explicit `about` so it is described in the
// top-level help listing, and each argument's `help` describes the argument
// itself rather than the command. Plain `//` comments are used because
// clap's derive macros read `///` doc comments as help text.
#[derive(Subcommand)]
enum Commands {
    #[command(about = "Crawl the web starting from a URL")]
    Crawl {
        #[arg(help = "URL to start crawling from")]
        url: String,

        #[arg(short, long, default_value = "10", help = "Maximum number of workers")]
        workers: usize,

        #[arg(short, long, default_value = "5", help = "Maximum crawl depth")]
        depth: u32,

        #[arg(short, long, help = "Output file for results")]
        output: Option<PathBuf>,

        #[arg(long, help = "Respect robots.txt")]
        respect_robots: bool,

        #[arg(long, default_value = "100", help = "Delay between requests in milliseconds")]
        delay: u64,
    },

    #[command(about = "Parse an HTML file and print the result as JSON")]
    Parse {
        #[arg(help = "File to parse")]
        file: PathBuf,

        #[arg(short, long, help = "Parsing rules file")]
        rules: Option<PathBuf>,
    },

    #[command(about = "Build knowledge graph from crawl results")]
    Graph {
        #[arg(help = "Input file containing crawl results")]
        input: PathBuf,

        #[arg(short, long, help = "Output graph file")]
        output: Option<PathBuf>,
    },

    #[command(about = "Show statistics for a crawl session")]
    Stats {
        #[arg(help = "Identifier of the crawl session")]
        session: Option<String>,
    },

    // Hidden from the help listing; prints a completion script to stdout.
    #[command(hide = true, about = "Generate shell completion scripts")]
    GenerateCompletions {
        #[arg(help = "Shell to generate completions for")]
        shell: clap_complete::Shell,
    },
}

#[tokio::main]
async fn main() -> Result<()> {
    let cli = Cli::parse();

    tracing_subscriber::fmt()
        .with_env_filter(if cli.verbose { "debug" } else { "info" })
        .init();

    match cli.command {
        Commands::Crawl {
            url,
            workers,
            depth,
            output,
            respect_robots,
            delay,
        } => {
            crawl_command(url, workers, depth, output, respect_robots, delay).await?;
        }
        Commands::Parse { file, rules } => {
            parse_command(file, rules).await?;
        }
        Commands::Graph { input, output } => {
            graph_command(input, output).await?;
        }
        Commands::Stats { session } => {
            stats_command(session).await?;
        }
        Commands::GenerateCompletions { shell } => {
            generate_completions(shell);
        }
    }

    Ok(())
}

async fn crawl_command(
    url: String,
    workers: usize,
    depth: u32,
    output: Option<PathBuf>,
    respect_robots: bool,
    delay: u64,
) -> Result<()> {
    println!("{}", "🕸️  Omnivore Web Crawler".bold().cyan());
    println!();

    let start_url = Url::parse(&url)?;
    println!("Starting crawl from: {}", start_url.to_string().green());
    println!("Configuration:");
    println!("  Workers: {}", workers.to_string().yellow());
    println!("  Max depth: {}", depth.to_string().yellow());
    println!("  Respect robots.txt: {}", respect_robots.to_string().yellow());
    println!("  Delay: {}ms", delay.to_string().yellow());
    println!();

    let config = CrawlConfig {
        max_workers: workers,
        max_depth: depth,
        user_agent: "Omnivore/1.0".to_string(),
        respect_robots_txt: respect_robots,
        politeness: PolitenessConfig {
            default_delay_ms: delay,
            max_requests_per_second: 1000.0 / delay as f64,
            backoff_multiplier: 2.0,
        },
        timeout_ms: 30000,
        max_retries: 3,
    };

    use std::sync::Arc;
    let crawler: Arc<Crawler> = Arc::new(Crawler::new(config).await?);
    crawler.add_seed(start_url).await?;

    let progress = ProgressBar::new_spinner();
    progress.set_style(
        ProgressStyle::default_spinner()
            .tick_chars("⠁⠂⠄⡀⢀⠠⠐⠈ ")
            .template("{spinner:.green} [{elapsed_precise}] {msg}")?,
    );

    let stats_handle = tokio::spawn({
        let crawler = Arc::clone(&crawler);
        let progress = progress.clone();
        async move {
            loop {
                let stats = crawler.get_stats().await;
                progress.set_message(format!(
                    "Crawled: {} | Success: {} | Failed: {} | In Progress: {}",
                    stats.total_urls.to_string().cyan(),
                    stats.successful.to_string().green(),
                    stats.failed.to_string().red(),
                    stats.in_progress.to_string().yellow()
                ));
                tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
            }
        }
    });

    let crawler = Arc::clone(&crawler);
    crawler.start().await?;
    stats_handle.abort();
    progress.finish_with_message("Crawl completed!");

    let final_stats = crawler.get_stats().await;
    println!();
    println!("{}", "📊 Final Statistics:".bold().green());
    println!("  Total URLs crawled: {}", final_stats.total_urls.to_string().cyan());
    println!("  Successful: {}", final_stats.successful.to_string().green());
    println!("  Failed: {}", final_stats.failed.to_string().red());
    println!("  Time elapsed: {:?}", final_stats.elapsed_time);

    if let Some(output_path) = output {
        let stats_json = serde_json::to_string_pretty(&final_stats)?;
        tokio::fs::write(output_path, stats_json).await?;
        println!("Results saved to file");
    }

    Ok(())
}

async fn parse_command(file: PathBuf, rules: Option<PathBuf>) -> Result<()> {
    println!("{}", "📄 Parsing HTML file...".bold().cyan());
    
    let content = tokio::fs::read_to_string(&file).await?;
    
    if let Some(rules_path) = rules {
        println!("Using rules from: {}", rules_path.display());
    }

    let parser = omnivore_core::parser::Parser::new(
        omnivore_core::parser::ParseConfig {
            rules: vec![],
            schema_name: None,
            clean_text: true,
            extract_metadata: true,
        }
    );

    let result = parser.parse(&content)?;
    println!();
    println!("{}", "✅ Parsing complete!".bold().green());
    println!("{}", serde_json::to_string_pretty(&result)?);

    Ok(())
}

/// Reads the crawl results at `input` and reports progress messages.
///
/// As written, this only reads the file, prints its size in bytes, and
/// announces the output path when one is given; no graph is written.
///
/// # Errors
/// Returns an error if `input` cannot be read as UTF-8 text.
async fn graph_command(input: PathBuf, output: Option<PathBuf>) -> Result<()> {
    let banner = "🔗 Building knowledge graph...".bold().cyan();
    println!("{}", banner);

    let raw = tokio::fs::read_to_string(&input).await?;
    let byte_count = raw.len();
    println!("Processing {} bytes of data", byte_count);

    if let Some(target) = &output {
        println!("Graph will be saved to: {}", target.display());
    }

    let done = "✅ Graph building complete!".bold().green();
    println!("{}", done);

    Ok(())
}

/// Prints the statistics header, followed by the given session identifier or
/// a notice that there are no active sessions.
///
/// Always returns `Ok(())`.
async fn stats_command(session: Option<String>) -> Result<()> {
    let header = "📊 Crawl Statistics".bold().cyan();
    println!("{}", header);

    match session {
        Some(id) => println!("Session: {}", id.yellow()),
        None => println!("No active sessions found"),
    }

    Ok(())
}

fn generate_completions(shell: clap_complete::Shell) {
    use clap_complete::{generate, Generator};
    use std::io;
    
    fn print_completions<G: Generator>(gen: G, cmd: &mut clap::Command) {
        generate(gen, cmd, cmd.get_name().to_string(), &mut io::stdout());
    }

    let mut cmd = Cli::command();
    print_completions(shell, &mut cmd);
}