// toak-rs 6.0.3
//
// A high-performance library and CLI tool for tokenizing git repositories, cleaning code, and generating embeddings.
// Documentation
mod cli;
mod markdown_generator;
mod token_cleaner;
#[cfg(feature = "embeddings")]
mod embeddings_generator;
mod text_chunker;
#[cfg(feature = "embeddings")]
mod json_database_generator;
#[cfg(feature = "embeddings")]
mod semantic_search;

use clap::Parser;
use cli::{Args, Commands};
use markdown_generator::{MarkdownGenerator, MarkdownGeneratorOptions};
#[cfg(feature = "embeddings")]
use json_database_generator::{JsonDatabaseGenerator, JsonDatabaseOptions};
#[cfg(feature = "embeddings")]
use semantic_search::SemanticSearch;

/// Program entry point: parses the command line and dispatches on the chosen subcommand.
///
/// * `Commands::Generate` forwards its options to `run_generate`; the `prompt` field is
///   parsed but not used here.
/// * `Commands::Search` (only compiled with the `embeddings` feature) forwards to `run_search`.
/// * `Commands::Version` prints the crate version captured at compile time.
#[tokio::main]
async fn main() {
  let command = Args::parse().command;

  match command {
    Commands::Generate {
      dir,
      output_file_path,
      quiet,
      prompt: _,
      embeddings,
    } => run_generate(dir, output_file_path, quiet, embeddings).await,
    #[cfg(feature = "embeddings")]
    Commands::Search {
      query,
      embeddings_file,
      top_n,
      full,
    } => run_search(&query, &embeddings_file, top_n, full),
    Commands::Version => println!("toak {}", env!("CARGO_PKG_VERSION")),
  }
}

/// Handles the generate subcommand: builds the markdown document and, when requested,
/// the embeddings database.
///
/// * `dir` - directory to process; defaults to `"."` when absent.
/// * `output_file_path` - markdown output path; defaults to `"prompt.md"` when absent.
/// * `quiet` - inverted into the `verbose` flag handed to the generators.
/// * `generate_embeddings` - passed to the markdown generator and, when `true`, triggers
///   `generate_embeddings_database` afterwards.
///
/// Any failure is reported on stderr and terminates the process with exit status 1.
async fn run_generate(
  dir: Option<std::path::PathBuf>,
  output_file_path: Option<std::path::PathBuf>,
  quiet: bool,
  generate_embeddings: bool,
) {
  println!("RUNNING TOKENIZER");

  let root = dir.unwrap_or_else(|| std::path::PathBuf::from("."));
  let verbose = !quiet;

  // `root` is cloned because it is needed again for the optional embeddings step.
  let markdown_options = MarkdownGeneratorOptions {
    dir: root.clone(),
    output_file_path: output_file_path.unwrap_or_else(|| std::path::PathBuf::from("prompt.md")),
    verbose,
    ..MarkdownGeneratorOptions::default()
  };

  let mut generator = MarkdownGenerator::new_for_cli(markdown_options, generate_embeddings);

  // Reduce the outcome to a single flag; a hard error exits right here.
  let succeeded = match generator.create_markdown_document().await {
    Ok(outcome) => outcome.success,
    Err(e) => {
      eprintln!("Error generating markdown: {}", e);
      std::process::exit(1);
    }
  };

  if !succeeded {
    eprintln!("Markdown generation failed");
    std::process::exit(1);
  }

  if generate_embeddings {
    generate_embeddings_database(root, verbose).await;
  }

  println!("\n✓ All tasks completed successfully!");
}

/// Builds the JSON embeddings database for `dir` (only compiled with the `embeddings` feature).
///
/// The generator is configured to output to `embeddings.json` inside `dir`, with
/// `max_concurrent_files` set to 4 and `verbose` forwarded unchanged. Initialisation errors,
/// generation errors, and an unsuccessful result are each reported on stderr and terminate
/// the process with exit status 1. On success a summary of files and chunks is printed.
#[cfg(feature = "embeddings")]
async fn generate_embeddings_database(dir: std::path::PathBuf, verbose: bool) {
  println!("\nGenerating embeddings database...");

  // The output path is listed first so `dir` is borrowed for the join before it is moved.
  let options = JsonDatabaseOptions {
    output_file_path: dir.join("embeddings.json"),
    dir,
    verbose,
    max_concurrent_files: 4,
    ..JsonDatabaseOptions::default()
  };

  let generator = JsonDatabaseGenerator::new(options).unwrap_or_else(|e| {
    eprintln!("Error initializing embeddings generator: {}", e);
    std::process::exit(1)
  });

  let summary = generator.generate_database().await.unwrap_or_else(|e| {
    eprintln!("Error generating embeddings: {}", e);
    std::process::exit(1)
  });

  if !summary.success {
    eprintln!("Embeddings generation failed");
    std::process::exit(1);
  }

  println!(
    "Successfully generated embeddings for {} files ({} chunks)",
    summary.total_files, summary.total_chunks
  );
}

/// Stand-in compiled when the `embeddings` feature is disabled.
///
/// Both arguments are ignored: the function reports on stderr that embeddings support is
/// missing from this build and terminates the process with exit status 1.
#[cfg(not(feature = "embeddings"))]
async fn generate_embeddings_database(_dir: std::path::PathBuf, _verbose: bool) {
  let message = "Embeddings support is not enabled in this build.";
  eprintln!("{}", message);
  std::process::exit(1)
}

/// Handles the search subcommand (only compiled with the `embeddings` feature).
///
/// * `query` - text passed to the search engine.
/// * `embeddings_file` - path handed to `SemanticSearch::new`.
/// * `top_n` - result limit forwarded to the search call.
/// * `full` - when `true`, prints each result's whole content; otherwise prints only the
///   first five lines followed by a note on how many lines were left out.
///
/// Load and search errors are reported on stderr and terminate the process with exit status 1.
#[cfg(feature = "embeddings")]
fn run_search(query: &str, embeddings_file: &std::path::Path, top_n: usize, full: bool) {
  const PREVIEW_LINES: usize = 5;
  const RULE_WIDTH: usize = 80;

  let mut engine = SemanticSearch::new(embeddings_file).unwrap_or_else(|e| {
    eprintln!("Error loading embeddings database: {}", e);
    eprintln!("Make sure {} exists. Run 'toak generate' first.", embeddings_file.display());
    std::process::exit(1)
  });

  // Header describing what is being searched.
  let metadata = engine.metadata();
  println!("Searching {} chunks from {} files", metadata.total_chunks, metadata.total_files);
  println!("Query: \"{}\"\n", query);

  let results = engine.search(query, top_n).unwrap_or_else(|e| {
    eprintln!("Error performing search: {}", e);
    std::process::exit(1)
  });

  if results.is_empty() {
    println!("No results found.");
    return;
  }

  let heavy_rule = "=".repeat(RULE_WIDTH);
  let light_rule = "-".repeat(RULE_WIDTH);

  println!("Top {} results:\n", results.len());
  println!("{}", heavy_rule);

  for (index, hit) in results.iter().enumerate() {
    let rank = index + 1;
    println!("\n{}. {} (similarity: {:.4})", rank, hit.file_path, hit.similarity);
    println!("{}", light_rule);

    if full {
      println!("{}", hit.content);
      continue;
    }

    // Take the preview lines, then count whatever is left in the same iterator.
    let mut lines = hit.content.lines();
    let preview: Vec<&str> = lines.by_ref().take(PREVIEW_LINES).collect();
    println!("{}", preview.join("\n"));

    let hidden = lines.count();
    if hidden > 0 {
      println!("\n... ({} more lines, use --full to see all)", hidden);
    }
  }

  println!("\n{}", heavy_rule);
}