//! reasonkit-mem 0.1.7
//!
//! High-performance vector database & RAG memory layer - hybrid search, embeddings, RAPTOR trees, BM25 fusion, and semantic retrieval for AI systems
//! `rk-mem` — ReasonKit Mem utilities (clap version)
//!
//! Docset ingestion (Cursor-like `@Docs`) without requiring `reasonkit-org`.
//!
//! This version uses clap for argument parsing and follows ReasonKit CLI standards.

// Top-level argument parser for the `rk-mem` binary. Name/about/version come
// from the `#[command]` attribute. Notes use `//` (not `///`) so clap's
// generated help text is unchanged.
#[derive(Parser)]
#[command(name = "rk-mem", about = "ReasonKit Memory utilities", version)]
struct Cli {
    // Flags shared across ReasonKit CLIs; `CliBase` is declared elsewhere
    // in the crate (not visible in this file chunk).
    #[command(flatten)]
    base: CliBase,
    
    // The selected subcommand (see `Commands`).
    #[command(subcommand)]
    command: Commands,
}

// Top-level subcommands. The `///` doc comments on variants double as clap
// help text, so additional documentation here uses `//` to avoid changing
// the user-visible help output.
#[derive(Subcommand)]
enum Commands {
    /// Docset ingestion and refresh (Cursor-like @Docs)
    Docs {
        #[command(subcommand)]
        action: DocsAction,
    },
    
    /// Show help information
    Help {
        /// Show help for a specific command
        command: Option<String>,
    },
}

// Subcommands under `rk-mem docs`. Variant/field `///` docs are rendered as
// clap help text; extra notes below use `//` so help output is unchanged.
#[derive(Subcommand)]
enum DocsAction {
    /// Add a new docset
    Add {
        /// Name of the docset
        name: String,
        /// Starting URL for the docset
        start_url: String,
        // Positional and optional: when omitted, `cmd_docs_add` defaults the
        // crawl scope to `start_url` itself.
        /// Allowed URL prefixes (defaults to start_url if not specified)
        allowed_prefixes: Option<Vec<String>>,
    },
    
    /// List all docsets
    List,
    
    /// Query documents in docsets
    Query {
        /// Search query
        query: String,
        
        // Resolved to a UUID by name lookup when it is not already a UUID.
        /// Filter by docset name or ID
        #[arg(long)]
        docset: Option<String>,
        
        /// Number of results to return
        #[arg(short = 'k', long, default_value = "8")]
        top_k: usize,
        
        /// Output in JSON format
        #[arg(long)]
        json: bool,
    },
    
    /// Remove a docset
    Remove {
        // Must be a UUID; names are not accepted here (see `docs list`).
        /// Docset ID to remove
        docset_id: String,
        
        /// Keep the index (don't delete indexed documents)
        #[arg(long)]
        keep_index: bool,
    },
    
    /// Refresh docsets
    Refresh {
        /// Only refresh docsets whose refresh policy is due
        #[arg(long)]
        due: bool,
        
        /// Maximum pages to fetch per docset
        #[arg(long)]
        max_pages: Option<usize>,
        
        /// Concurrency level for fetching
        #[arg(long)]
        concurrency: Option<usize>,
        
        /// Request timeout in seconds
        #[arg(long)]
        timeout_secs: Option<u64>,
    },
}

/// One search hit produced by `docs query`, used for both the JSON output
/// (serialized as-is) and the human-readable listing.
#[derive(Debug, Serialize)]
struct DocsQueryHit {
    /// Relevance score reported by the retriever for this chunk.
    score: f32,
    /// Sparse-retrieval component score, when the retriever provides one.
    sparse_score: Option<f32>,
    /// Owning document id (stringified UUID).
    doc_id: String,
    /// Matched chunk id within the document (stringified).
    chunk_id: String,
    /// Source URL from the document's metadata, if recorded.
    url: Option<String>,
    /// Document title from metadata, if recorded.
    title: Option<String>,
    /// Docset name, parsed from the document's `docset:` tag.
    docset: Option<String>,
    /// Docset UUID, parsed from the document's `docset_id:` tag.
    docset_id: Option<String>,
    /// The matched chunk's text content.
    text: String,
}

/// Resolve the rk-mem data directory.
///
/// The `RKMEM_DATA_DIR` environment variable, when set (to valid UTF-8),
/// overrides everything. Otherwise the platform-local data directory
/// (falling back to `.` if unavailable) is used, under `reasonkit/mem`.
fn data_dir() -> PathBuf {
    match std::env::var("RKMEM_DATA_DIR") {
        Ok(dir) => PathBuf::from(dir),
        Err(_) => dirs::data_local_dir()
            .unwrap_or_else(|| PathBuf::from("."))
            .join("reasonkit")
            .join("mem"),
    }
}

/// Create (or update) a docset configuration in `store`.
///
/// When `allowed_prefixes` is `None`, the crawl scope defaults to the
/// start URL itself. Prints a success line on completion.
async fn cmd_docs_add(store: &DocsetStore, name: String, start_url: String, allowed_prefixes: Option<Vec<String>>) -> anyhow::Result<()> {
    // Default the allowed-prefix list to the start URL when not provided.
    let prefixes = match allowed_prefixes {
        Some(p) => p,
        None => vec![start_url.clone()],
    };
    
    let saved = store.upsert(Docset::new(name, start_url, prefixes)).await?;
    
    format_success(&format!("Added docset: {} ({})", saved.name, saved.id), None, OutputFormat::Text)?;
    Ok(())
}

/// Print every configured docset with its refresh policy, status, and
/// whether a refresh is currently due.
async fn cmd_docs_list(store: &DocsetStore) -> anyhow::Result<()> {
    let docsets = store.load().await?;
    if docsets.is_empty() {
        println!("No docsets configured.");
        return Ok(());
    }
    
    // Due-ness is evaluated against a single timestamp for the whole listing.
    let now = Utc::now();
    let body: String = docsets
        .into_iter()
        .map(|ds| {
            format!(
                "\n{}\n  name: {}\n  start_url: {}\n  refresh: {:?}\n  status: {:?}\n  due: {}\n",
                ds.id, ds.name, ds.start_url, ds.refresh, ds.status, ds.is_due(now)
            )
        })
        .collect();
    
    println!("{}{}", section_header("Docset List", true), body);
    Ok(())
}

/// Remove a docset's configuration and, unless `keep_index` is set, any
/// indexed documents that belong to it.
///
/// The docset must be addressed by UUID (names are not accepted here; see
/// `rk-mem docs list`). Index cleanup scans all stored documents and matches
/// them by the `docset_id:<uuid>` tag; per-document deletions are best-effort
/// and individual failures are ignored.
async fn cmd_docs_remove(store: &DocsetStore, data_dir: &Path, docset_id: String, keep_index: bool) -> anyhow::Result<()> {
    let id = Uuid::parse_str(&docset_id).context("Invalid DOCSET_ID")?;
    
    // Remove the configuration entry first; an unknown id is an error.
    let deleted = store.delete(id).await?;
    if !deleted {
        return Err(anyhow!("Docset not found: {}", id));
    }
    
    format_success(&format!("Removed docset config: {}", id), None, OutputFormat::Text)?;
    
    // Config is gone; with --keep-index the indexed documents are left alone.
    if keep_index {
        return Ok(());
    }
    
    // Best-effort: delete any indexed documents tagged with this docset id.
    let retriever = open_default_docset_retriever(data_dir.to_path_buf()).await?;
    let ctx = AccessContext::new(
        "rk-mem".to_string(),
        AccessLevel::Admin,
        "docs_remove".to_string(),
    );
    
    let doc_ids = retriever
        .storage()
        .list_documents(&ctx)
        .await
        .context("Failed to list documents")?;
    
    let wanted_tag = format!("docset_id:{id}");
    let mut removed_docs = 0usize;
    for doc_id in doc_ids {
        let doc = retriever
            .storage()
            .get_document(&doc_id, &ctx)
            .await
            .context("Failed to read document")?;
        // A document that vanished between list and get is simply skipped.
        let Some(doc) = doc else {
            continue;
        };
        if doc.metadata.tags.iter().any(|t| t == &wanted_tag) {
            // Deliberately ignore per-document delete errors (best-effort),
            // but still count the attempt as a removal.
            let _ = retriever.delete_document(&doc_id).await;
            removed_docs += 1;
        }
    }
    
    println!("Removed {} document(s) from index.", removed_docs);
    Ok(())
}

/// Search the docset index and print up to `top_k` hits.
///
/// `docset_filter` may be a docset UUID or a case-insensitive docset name;
/// an unknown name is an error. Only documents carrying a `docset_id:` tag
/// are considered — other documents sharing the index are skipped. With
/// `json`, results are pretty-printed as a JSON array of `DocsQueryHit`;
/// otherwise a human-readable list is printed (which shows the title, not
/// the chunk text — the full text is available via the JSON output).
async fn cmd_docs_query(
    store: &DocsetStore,
    data_dir: &Path,
    query: String,
    docset_filter: Option<String>,
    top_k: usize,
    json: bool,
) -> anyhow::Result<()> {
    // Resolve the optional filter to a docset UUID: accept a raw UUID as-is,
    // otherwise look the name up among configured docsets.
    let wanted_docset_id = if let Some(filter) = docset_filter {
        if let Ok(id) = Uuid::parse_str(&filter) {
            Some(id)
        } else {
            let docsets = store.load().await?;
            let Some(found) = docsets.iter().find(|d| d.name.eq_ignore_ascii_case(&filter)) else {
                return Err(anyhow!(
                    "Unknown docset: {} (expected name or UUID; see `rk-mem docs list`)",
                    filter
                ));
            };
            Some(found.id)
        }
    } else {
        None
    };
    
    let retriever = open_default_docset_retriever(data_dir.to_path_buf()).await?;
    let ctx = AccessContext::new(
        "rk-mem".to_string(),
        AccessLevel::Admin,
        "docs_query".to_string(),
    );
    
    // Over-fetch, then filter to docset-tagged docs.
    // 5x the requested count, capped at 200, leaves headroom for candidates
    // that get dropped by the tag filters below.
    let candidate_k = if top_k >= 200 {
        top_k
    } else {
        top_k.saturating_mul(5).min(200)
    };
    let results = retriever.search_sparse(&query, candidate_k).await?;
    
    // Multiple chunk hits can share a document; cache lookups (including
    // misses, stored as None) so each document is fetched at most once.
    let mut doc_cache: HashMap<Uuid, Option<reasonkit_mem::Document>> = HashMap::new();
    let mut hits: Vec<DocsQueryHit> = Vec::with_capacity(top_k);
    
    for r in results {
        if hits.len() >= top_k {
            break;
        }
        
        let doc = if let Some(cached) = doc_cache.get(&r.doc_id) {
            cached.clone()
        } else {
            let loaded = retriever
                .storage()
                .get_document(&r.doc_id, &ctx)
                .await
                .context("Failed to read document")?;
            doc_cache.insert(r.doc_id, loaded.clone());
            loaded
        };
        
        // Missing documents (deleted since indexing) are skipped silently.
        let Some(doc) = doc else {
            continue;
        };
        
        // Restrict to docset docs by default.
        let docset_id_tag = doc
            .metadata
            .tags
            .iter()
            .find(|t| t.starts_with("docset_id:"))
            .cloned();
        if docset_id_tag.is_none() {
            continue;
        }
        
        // Apply the resolved docset filter, if any, via exact tag match.
        if let Some(wanted) = wanted_docset_id {
            let wanted_tag = format!("docset_id:{wanted}");
            if !doc.metadata.tags.iter().any(|t| t == &wanted_tag) {
                continue;
            }
        }
        
        // Human-readable docset name, also carried as a tag.
        let docset_tag = doc
            .metadata
            .tags
            .iter()
            .find(|t| t.starts_with("docset:"))
            .map(|t| t.trim_start_matches("docset:").to_string());
        
        hits.push(DocsQueryHit {
            score: r.score,
            sparse_score: r.sparse_score,
            doc_id: r.doc_id.to_string(),
            chunk_id: r.chunk_id.to_string(),
            url: doc.source.url.clone(),
            title: doc.metadata.title.clone(),
            docset: docset_tag,
            docset_id: docset_id_tag.map(|t| t.trim_start_matches("docset_id:").to_string()),
            text: r.text,
        });
    }
    
    if json {
        println!("{}", serde_json::to_string_pretty(&hits)?);
        return Ok(());
    }
    
    if hits.is_empty() {
        println!("No results.");
        return Ok(());
    }
    
    println!("{}", section_header("Search Results", true));
    for (i, h) in hits.iter().enumerate() {
        let url = h.url.as_deref().unwrap_or("<no-url>");
        let title = h.title.as_deref().unwrap_or("<no-title>");
        println!(
            "#{:02} score={:.4} docset={} url={}\n  {}\n",
            i + 1,
            h.score,
            h.docset.as_deref().unwrap_or("<unknown>"),
            url,
            title
        );
    }
    
    Ok(())
}

/// Refresh (re-crawl and re-index) the configured docsets.
///
/// With `due`, only docsets whose refresh policy says they are due are
/// processed (enforced by the ingestor via `refresh_due_only`). `max_pages`,
/// `concurrency`, and `timeout_secs` override the ingestor's defaults when
/// provided. A failure for one docset is recorded on that docset's status
/// and reported to stderr, but does not abort the remaining docsets.
async fn cmd_docs_refresh(
    store: &DocsetStore,
    data_dir: &Path,
    due: bool,
    max_pages: Option<usize>,
    concurrency: Option<usize>,
    timeout_secs: Option<u64>,
) -> anyhow::Result<()> {
    // Start from the ingestor defaults and apply only the overrides given.
    let mut opts = DocsetIngestOptions {
        manifest_dir: Some(data_dir.join("docsets")),
        refresh_due_only: due,
        ..Default::default()
    };
    
    if let Some(max) = max_pages {
        opts.max_pages = max;
    }
    if let Some(conc) = concurrency {
        opts.concurrency = conc;
    }
    if let Some(secs) = timeout_secs {
        opts.request_timeout = std::time::Duration::from_secs(secs);
    }
    
    let mut docsets = store.load().await?;
    if docsets.is_empty() {
        println!("No docsets configured.");
        return Ok(());
    }
    
    let retriever = open_default_docset_retriever(data_dir.to_path_buf()).await?;
    let ingestor = DocsetIngestor::new(retriever)?;
    
    println!("{}", section_header("Refreshing Docsets", true));
    
    for ds in docsets.iter_mut() {
        // `ingest_docset` takes `&mut` — presumably it updates the docset's
        // status/timestamps on success; confirm against DocsetIngestor.
        let res = ingestor.ingest_docset(ds, &opts).await;
        match res {
            Ok(report) => {
                println!(
                    "Docset {} ({}) — method={}, discovered={}, fetched={}, indexed={}, skipped={}, removed={}, failures={}",
                    report.docset_name,
                    report.docset_id,
                    report.discovery_method,
                    report.discovered_urls,
                    report.fetched_pages,
                    report.indexed_pages,
                    report.skipped_unchanged,
                    report.removed_pages,
                    report.failures
                );
            }
            Err(e) => {
                // Record the failure on the docset so `docs list` shows it.
                ds.status = RefreshStatus::Error {
                    at: Utc::now(),
                    message: e.to_string(),
                };
                eprintln!("Docset {} ({}) — ERROR: {}", ds.name, ds.id, e);
            }
        }
        
        // Persist latest status / timestamps.
        // Best-effort: a persistence failure here is deliberately ignored so
        // the remaining docsets still get refreshed.
        let _ = store.upsert(ds.clone()).await;
    }
    
    format_success("Refresh complete", None, OutputFormat::Text)?;
    Ok(())
}