leankg 0.16.7 - Docs.rs

pub mod context_parser;
pub mod data;
pub mod runner;
pub mod summary;

pub use context_parser::QualityMetrics;

use std::path::PathBuf;

pub use runner::{BenchmarkRunner, CliTool};

pub fn run(category: Option<String>, cli: CliTool) -> Result<(), Box<dyn std::error::Error>> {
    let prompts_dir = PathBuf::from("benchmark/prompts");
    let output_dir = PathBuf::from("benchmark/results");

    let categories = if let Some(cat) = category {
        vec![data::PromptCategory::from_yaml(
            &prompts_dir.join(format!("{}.yaml", cat)),
        )?]
    } else {
        data::PromptCategory::load_all(&prompts_dir)?
    };

    let runner = BenchmarkRunner::new(output_dir, cli);

    for cat in &categories {
        println!("\n=== Category: {} ===\n", cat.name);
        for task in &cat.tasks {
            println!("Running: {}", task.id);

            let with_leankg = runner.run_with_leankg(&task.prompt);
            let without_leankg = runner.run_without_leankg(&task.prompt);

            let overhead = with_leankg.overhead(&without_leankg);

            println!(
                "  With LeanKG: {} tokens (input: {}, cached: {})",
                with_leankg.total_tokens, with_leankg.input_tokens, with_leankg.cached_tokens
            );
            println!(
                "  Without: {} tokens (input: {}, cached: {})",
                without_leankg.total_tokens,
                without_leankg.input_tokens,
                without_leankg.cached_tokens
            );
            println!("  Overhead: {} tokens", overhead.token_delta);

            if !task.expected_files.is_empty() {
                let with_quality = with_leankg
                    .context
                    .as_ref()
                    .map(|c| QualityMetrics::calculate(&task.expected_files, &c.files_referenced));
                let without_quality = without_leankg
                    .context
                    .as_ref()
                    .map(|c| QualityMetrics::calculate(&task.expected_files, &c.files_referenced));

                if let Some(wq) = &with_quality {
                    println!(
                        "  LeanKG Quality: Precision={:.2} | Recall={:.2} | F1={:.2} | {}",
                        wq.precision,
                        wq.recall,
                        wq.f1_score,
                        wq.verdict()
                    );
                    println!("    Correct Files: {:?}", wq.correct_files);
                    if !wq.incorrect_files.is_empty() {
                        println!("    Incorrect (false positives): {:?}", wq.incorrect_files);
                    }
                    if !wq.missing_files.is_empty() {
                        println!("    Missing (false negatives): {:?}", wq.missing_files);
                    }
                } else {
                    println!("  LeanKG Quality: (context not available)");
                }

                if let Some(uq) = &without_quality {
                    println!(
                        "  Without LeanKG Quality: Precision={:.2} | Recall={:.2} | F1={:.2} | {}",
                        uq.precision,
                        uq.recall,
                        uq.f1_score,
                        uq.verdict()
                    );
                } else {
                    println!("  Without LeanKG Quality: (context not available)");
                }
            }
            println!();

            let _ = runner.save_comparison(&with_leankg, &without_leankg, &task.id);
        }
    }

    Ok(())
}