ragcli 0.1.0

CLI for local RAG
use crate::config::{ensure_store_layout, load_or_create_config, store_dir};
use crate::store::{
    collect_store_stats, connect_db, load_metadata, StoreMetadata, StoreStats, DEFAULT_TABLE_NAME,
};
use crate::ui::{self, Panel};
use anyhow::Result;
use futures::TryStreamExt;
use lancedb::query::ExecutableQuery;
use serde::Serialize;
use std::path::Path;
use walkdir::WalkDir;

/// Byte totals for the regular files found under a store directory.
///
/// Produced by `collect_disk_usage` and serialized as part of [`StatReport`].
#[derive(Debug, Serialize)]
pub struct DiskUsageReport {
    /// Combined size of every regular file found under the store root,
    /// including files outside the four named subdirectories.
    pub total_bytes: u64,
    /// Size of files whose top-level directory (relative to the store) is `lancedb`.
    pub lancedb_bytes: u64,
    /// Size of files whose top-level directory (relative to the store) is `meta`.
    pub meta_bytes: u64,
    /// Size of files whose top-level directory (relative to the store) is `cache`.
    pub cache_bytes: u64,
    /// Size of files whose top-level directory (relative to the store) is `models`.
    pub models_bytes: u64,
}

/// Everything the `stat` command reports about one store.
///
/// Built by `build_report`; printed either as pretty JSON or as
/// human-readable panels by `run`.
#[derive(Debug, Serialize)]
pub struct StatReport {
    /// Store directory path, rendered with `Path::display`.
    pub store: String,
    /// Ollama base URL taken from the store's config (`cfg.ollama.base_url`).
    pub ollama_url: String,
    /// Statistics computed by `collect_store_stats` over the rows read from
    /// the default table (zero rows when the table could not be opened).
    pub stats: StoreStats,
    /// Store metadata, or `None` when `load_metadata` returned an error.
    pub metadata: Option<StoreMetadata>,
    /// Per-directory byte totals for the store.
    pub disk_usage: DiskUsageReport,
    /// Non-fatal problems hit while measuring disk usage (unreadable paths or files).
    pub warnings: Vec<String>,
}

/// Entry point for the `stat` command: builds the report for the store
/// selected by `name` and prints it to stdout.
///
/// When `json` is set the report is emitted as pretty-printed JSON; otherwise
/// it is rendered as human-readable panels.
///
/// # Errors
///
/// Propagates any failure from building the report or from JSON serialization.
pub async fn run(name: Option<&str>, json: bool) -> Result<()> {
    let report = build_report(name).await?;

    // Human output is the common case; handle it first and bail out.
    if !json {
        print_human(&report);
        return Ok(());
    }

    let rendered = serde_json::to_string_pretty(&report)?;
    println!("{}", rendered);
    Ok(())
}

/// Gathers all data shown by the `stat` command for the store selected by `name`.
///
/// Resolves the store directory, ensures its layout exists, loads (or creates)
/// its config, reads every row of the default table, and measures disk usage.
///
/// # Errors
///
/// Fails when the store cannot be resolved or prepared, the config cannot be
/// loaded, the database connection fails, the table query or row collection
/// fails, or the statistics cannot be computed. A missing or unopenable table
/// and unreadable metadata are tolerated.
async fn build_report(name: Option<&str>) -> Result<StatReport> {
    let store = store_dir(name)?;
    ensure_store_layout(&store)?;
    let cfg = load_or_create_config(&store)?;
    // Metadata is optional: a load failure becomes `None` rather than an error.
    let metadata = load_metadata(&store).ok();
    let db = connect_db(&store).await?;

    // Any failure to open the table is treated as "no table"; the report is
    // then computed over zero rows.
    let table = db.open_table(DEFAULT_TABLE_NAME).execute().await.ok();

    let batches = match table.as_ref() {
        Some(table) => {
            let stream = table.query().execute().await?;
            stream.try_collect::<Vec<_>>().await?
        }
        None => Vec::new(),
    };

    // NOTE(review): the `5` is presumably the number of top sources to keep —
    // confirm against `collect_store_stats`.
    let stats = collect_store_stats(&batches, 5)?;
    let (disk_usage, warnings) = collect_disk_usage(&store);

    let report = StatReport {
        store: store.display().to_string(),
        ollama_url: cfg.ollama.base_url,
        stats,
        metadata,
        disk_usage,
        warnings,
    };
    Ok(report)
}

/// Prints the report to stdout as a series of panels: store summary, content
/// mix, storage, then (only when non-empty) warnings and the top-sources table.
fn print_human(report: &StatReport) {
    ui::command_header("ragcli stat", "");

    let stats = &report.stats;
    let kinds = &stats.content_kinds;
    let disk = &report.disk_usage;
    // Third argument handed to `Panel::kv`; presumably the label column
    // width — confirm against the `ui` module.
    let summary_width = 13;
    let detail_width = 8;

    let mut summary = Panel::new("Store Summary");
    summary.kv("store", &report.store, summary_width);
    summary.kv("ollama url", &report.ollama_url, summary_width);
    summary.kv("rows", stats.total_chunks.to_string(), summary_width);
    summary.kv("sources", stats.unique_sources.to_string(), summary_width);
    summary.kv("pdf pages", stats.pdf_pages.to_string(), summary_width);
    summary.kv("chars", fmt_count(stats.total_chars), summary_width);
    let token_estimate = format!("~{}", fmt_count(stats.estimated_tokens));
    summary.kv("tokens", token_estimate, summary_width);
    summary.render();

    println!();
    let mut content = Panel::new("Content Mix");
    content.kv("text", kinds.text_files.to_string(), detail_width);
    content.kv("pdf", kinds.pdf_files.to_string(), detail_width);
    content.kv("image", kinds.image_files.to_string(), detail_width);
    content.kv("other", kinds.other_files.to_string(), detail_width);
    // Per-chunk averages divide by the chunk count, so skip them for an empty store.
    if stats.total_chunks > 0 {
        let avg_chars = stats.total_chars / stats.total_chunks;
        let avg_tokens = stats.estimated_tokens / stats.total_chunks;
        content.kv(
            "avg",
            format!("{} chars, ~{} tokens", avg_chars, avg_tokens),
            detail_width,
        );
        content.kv(
            "range",
            format!(
                "{}..{} chars",
                stats.min_chunk_chars, stats.max_chunk_chars
            ),
            detail_width,
        );
    }
    content.render();

    println!();
    let mut storage = Panel::new("Storage");
    for (label, bytes) in [
        ("total", disk.total_bytes),
        ("lancedb", disk.lancedb_bytes),
        ("meta", disk.meta_bytes),
        ("cache", disk.cache_bytes),
        ("models", disk.models_bytes),
    ] {
        storage.kv(label, fmt_bytes(bytes), detail_width);
    }
    match &report.metadata {
        Some(meta) => {
            storage.kv(
                "embed",
                format!("{} (dim {})", meta.embed_model, meta.embedding_dim),
                detail_width,
            );
            storage.kv(
                "chunking",
                format!("size {}, overlap {}", meta.chunk_size, meta.chunk_overlap),
                detail_width,
            );
        }
        None => {
            storage.kv("embed", ui::warn("metadata missing"), detail_width);
        }
    }
    storage.render();

    if !report.warnings.is_empty() {
        println!();
        let mut panel = Panel::new("Warnings");
        for message in &report.warnings {
            panel.prose("warning", message, 8);
        }
        panel.render();
    }

    if !stats.top_sources.is_empty() {
        println!();
        // One table row per source: path, chunk count, approximate token count.
        let rows = stats
            .top_sources
            .iter()
            .map(|entry| {
                vec![
                    entry.source_path.clone(),
                    fmt_count(entry.chunks),
                    format!("~{}", fmt_count(entry.estimated_tokens)),
                ]
            })
            .collect();
        ui::render_table("Top Sources", &["Source", "Chunks", "Tokens"], rows);
    }
}

/// Returns the number of bytes stored at `path` (test-only helper).
///
/// A path that does not exist counts as `0`, a regular file counts as its own
/// length, and a directory counts as the summed length of every regular file
/// found while walking it recursively.
///
/// # Errors
///
/// Fails if file metadata cannot be read or the directory walk hits an error.
#[cfg(test)]
pub fn dir_size_bytes(path: &Path) -> Result<u64> {
    if !path.exists() {
        return Ok(0);
    }

    if path.is_file() {
        let meta = path.metadata()?;
        return Ok(meta.len());
    }

    walkdir::WalkDir::new(path)
        .into_iter()
        .try_fold(0_u64, |sum, item| -> Result<u64> {
            let item = item?;
            // Directories and other non-file entries contribute nothing.
            if item.file_type().is_file() {
                Ok(sum + item.metadata()?.len())
            } else {
                Ok(sum)
            }
        })
}

/// Walks `store` once and tallies file sizes in total and per well-known
/// top-level directory (`lancedb`, `meta`, `cache`, `models`).
///
/// Never fails: entries that cannot be read are skipped and described in the
/// returned warning list instead. A store path that does not exist yields an
/// all-zero report with no warnings.
fn collect_disk_usage(store: &Path) -> (DiskUsageReport, Vec<String>) {
    let mut usage = DiskUsageReport {
        total_bytes: 0,
        lancedb_bytes: 0,
        meta_bytes: 0,
        cache_bytes: 0,
        models_bytes: 0,
    };
    let mut notes = Vec::new();

    if store.exists() {
        for item in WalkDir::new(store) {
            let item = match item {
                Err(err) => {
                    notes.push(format!("skipped unreadable path: {err}"));
                    continue;
                }
                Ok(item) => item,
            };
            if !item.file_type().is_file() {
                continue;
            }

            let size = match item.metadata() {
                Err(err) => {
                    notes.push(format!(
                        "skipped unreadable file {}: {}",
                        item.path().display(),
                        err
                    ));
                    continue;
                }
                Ok(meta) => meta.len(),
            };
            // Every readable file counts toward the total, even when it does
            // not belong to one of the named buckets below.
            usage.total_bytes += size;

            // First path component relative to the store root, as UTF-8.
            let top_level = item
                .path()
                .strip_prefix(store)
                .ok()
                .and_then(|relative| relative.components().next())
                .and_then(|component| component.as_os_str().to_str());
            let Some(top_level) = top_level else {
                continue;
            };

            let bucket = match top_level {
                "lancedb" => &mut usage.lancedb_bytes,
                "meta" => &mut usage.meta_bytes,
                "cache" => &mut usage.cache_bytes,
                "models" => &mut usage.models_bytes,
                _ => continue,
            };
            *bucket += size;
        }
    }

    (usage, notes)
}

/// Formats a byte count for display using binary (1024-based) units.
///
/// Values below 1024 are printed exactly, e.g. `999 B`. Larger values are
/// scaled to the largest fitting unit among KB, MB, GB and TB and printed with
/// one decimal place, e.g. `2.0 KB`. Values beyond the TB range stay in TB.
///
/// A value that would round up to `1024.0` of a unit is promoted to the next
/// unit, so just under one mebibyte prints as `1.0 MB` rather than `1024.0 KB`.
pub fn fmt_bytes(bytes: u64) -> String {
    const UNITS: [&str; 5] = ["B", "KB", "MB", "GB", "TB"];
    const STEP: f64 = 1024.0;
    let last_unit = UNITS.len() - 1;

    // Precision loss for huge values is irrelevant at one displayed decimal.
    let mut value = bytes as f64;
    let mut unit = 0;
    while value >= STEP && unit < last_unit {
        value /= STEP;
        unit += 1;
    }

    if unit == 0 {
        return format!("{} {}", bytes, UNITS[unit]);
    }

    // The unit was chosen before rounding: 1023.96 is below the step, but
    // `{:.1}` would display it as "1024.0". Promote such values one unit.
    if unit < last_unit && (value * 10.0).round() / 10.0 >= STEP {
        value /= STEP;
        unit += 1;
    }

    format!("{value:.1} {}", UNITS[unit])
}

/// Formats `value` in decimal with a comma between each group of three digits,
/// counted from the right (`1234567` becomes `1,234,567`).
pub fn fmt_count(value: usize) -> String {
    let digits = value.to_string();
    // Chunk from the right so that only the leading group can be short, then
    // flip the groups back into reading order.
    let groups: Vec<&str> = digits
        .as_bytes()
        .rchunks(3)
        .rev()
        .map(|group| std::str::from_utf8(group).expect("decimal digits are ASCII"))
        .collect();
    groups.join(",")
}

// Unit tests for the formatting helpers, the disk-usage walkers, and report
// building against an empty store.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::STORE_SUBDIRECTORIES;
    use crate::test_support::with_test_env;

    // `fmt_bytes` prints small values exactly and scales larger ones;
    // `fmt_count` inserts thousands separators.
    #[test]
    fn test_helpers_format_counts_and_sizes() {
        assert_eq!(fmt_bytes(999), "999 B");
        assert_eq!(fmt_bytes(2048), "2.0 KB");
        assert_eq!(fmt_count(12), "12");
        assert_eq!(fmt_count(1234), "1,234");
        assert_eq!(fmt_count(1234567), "1,234,567");
    }

    // `dir_size_bytes` handles the three path shapes: missing, single file,
    // and a directory with a nested file.
    #[test]
    fn test_helpers_measure_directory_sizes() {
        let dir = tempfile::tempdir().unwrap();
        let file = dir.path().join("a.txt");
        let nested = dir.path().join("nested");
        let nested_file = nested.join("b.txt");
        std::fs::create_dir_all(&nested).unwrap();
        std::fs::write(&file, b"abc").unwrap();
        std::fs::write(&nested_file, b"12345").unwrap();

        assert_eq!(dir_size_bytes(&dir.path().join("missing")).unwrap(), 0);
        assert_eq!(dir_size_bytes(&file).unwrap(), 3);
        // 3 bytes in a.txt + 5 bytes in nested/b.txt.
        assert_eq!(dir_size_bytes(dir.path()).unwrap(), 8);
    }

    // Files at the store root and in unrecognized directories count toward
    // the total only; files in the named directories also land in their bucket.
    #[test]
    fn test_collect_disk_usage_walks_store_once_and_keeps_root_files() {
        let dir = tempfile::tempdir().unwrap();
        let store = dir.path().join("store");
        std::fs::create_dir_all(&store).unwrap();
        for subdirectory in STORE_SUBDIRECTORIES {
            std::fs::create_dir_all(store.join(subdirectory)).unwrap();
        }
        std::fs::write(store.join("config.toml"), b"abc").unwrap();
        std::fs::write(store.join("lancedb").join("data.bin"), b"12345").unwrap();
        std::fs::write(store.join("meta").join("store.toml"), b"12").unwrap();
        std::fs::write(store.join("cache").join("blob.bin"), b"1234").unwrap();
        std::fs::write(store.join("models").join("weights.gguf"), b"123456").unwrap();
        std::fs::create_dir_all(store.join("other")).unwrap();
        std::fs::write(store.join("other").join("extra.bin"), b"1234567").unwrap();

        let (report, warnings) = collect_disk_usage(&store);

        assert!(warnings.is_empty());
        // 3 (root) + 5 (lancedb) + 2 (meta) + 4 (cache) + 6 (models) + 7 (other).
        assert_eq!(report.total_bytes, 27);
        assert_eq!(report.lancedb_bytes, 5);
        assert_eq!(report.meta_bytes, 2);
        assert_eq!(report.cache_bytes, 4);
        assert_eq!(report.models_bytes, 6);
    }

    // Building a report for a store with no indexed data must succeed, report
    // zero chunks, and still serialize to JSON.
    // NOTE(review): `with_test_env` presumably redirects the store root into
    // the temp dir — confirm in `test_support`.
    #[tokio::test(flavor = "current_thread")]
    async fn test_build_report_supports_empty_store() {
        let dir = tempfile::tempdir().unwrap();
        with_test_env(dir.path(), None, || async {
            let report = build_report(Some("empty")).await.unwrap();
            assert_eq!(report.stats.total_chunks, 0);
            assert!(serde_json::to_string(&report)
                .unwrap()
                .contains("\"stats\""));
        })
        .await;
    }
}