frigg 0.4.5

Local-first MCP server for code understanding.
Documentation
use std::error::Error;
use std::io;
use std::path::Path;

use frigg::settings::FriggConfig;
use frigg::storage::Storage;
use serde::Serialize;
use serde_json::{Map, Value, to_string_pretty};

use crate::WorkloadCorpusExportFormat;
use crate::cli_runtime::storage_paths::resolve_storage_db_path;

/// Longest string (counted in `char`s) kept verbatim in an exported value;
/// longer strings are cut to this many characters and suffixed with `"..."`.
const WORKLOAD_CORPUS_MAX_STRING_CHARS: usize = 256;
/// Number of leading array elements kept when an exported value is sanitized.
const WORKLOAD_CORPUS_MAX_ARRAY_ITEMS: usize = 8;
/// Number of object entries kept (taken in sorted-key order) when an exported
/// value is sanitized.
const WORKLOAD_CORPUS_MAX_OBJECT_ENTRIES: usize = 16;
/// Depth budget handed to the sanitizer for each exported field. Every
/// array/object level consumes one unit; a value reached with no budget left
/// is replaced by the `"[truncated-depth]"` placeholder string.
const WORKLOAD_CORPUS_MAX_DEPTH: usize = 6;

/// One serialized record of the workload corpus export.
///
/// Each row is built from a single stored provenance event; the `*_summary`
/// fields and `normalized_workload` hold size-bounded copies of parts of the
/// event's JSON payload.
#[derive(Debug, Clone, Serialize)]
struct WorkloadCorpusExportRow {
    /// Trace identifier copied from the provenance event.
    trace_id: String,
    /// Creation timestamp string copied from the provenance event.
    created_at: String,
    /// The payload's `target_repository_id` when it is a string, otherwise the
    /// id of the repository whose storage the event was loaded from.
    repository_id: String,
    /// Tool name copied from the provenance event.
    tool_name: String,
    /// Sanitized copy of the payload's `params` field (`null` when absent).
    parameter_summary: Value,
    /// Sanitized copy of the payload's `outcome` field (`null` when absent).
    outcome_summary: Value,
    /// Sanitized copy of the payload's `source_refs` field (`null` when absent).
    source_refs_summary: Value,
    /// Length of the payload's `source_refs` array before sanitization; `0`
    /// when the field is missing or is not an array.
    source_ref_count: usize,
    /// Sanitized copy of the payload's `normalized_workload` field, or `None`
    /// when the payload has no such field.
    normalized_workload: Option<Value>,
}

fn bounded_workload_corpus_text(value: &str) -> String {
    if value.chars().count() <= WORKLOAD_CORPUS_MAX_STRING_CHARS {
        return value.to_owned();
    }

    let mut bounded = value
        .chars()
        .take(WORKLOAD_CORPUS_MAX_STRING_CHARS)
        .collect::<String>();
    bounded.push_str("...");
    bounded
}

fn sanitize_workload_corpus_value(value: &Value, remaining_depth: usize) -> Value {
    if remaining_depth == 0 {
        return Value::String("[truncated-depth]".to_owned());
    }

    match value {
        Value::Null | Value::Bool(_) | Value::Number(_) => value.clone(),
        Value::String(text) => Value::String(bounded_workload_corpus_text(text)),
        Value::Array(items) => Value::Array(
            items
                .iter()
                .take(WORKLOAD_CORPUS_MAX_ARRAY_ITEMS)
                .map(|item| sanitize_workload_corpus_value(item, remaining_depth - 1))
                .collect(),
        ),
        Value::Object(entries) => {
            let mut ordered_keys = entries.keys().cloned().collect::<Vec<_>>();
            ordered_keys.sort();

            let mut sanitized = Map::new();
            for key in ordered_keys
                .into_iter()
                .take(WORKLOAD_CORPUS_MAX_OBJECT_ENTRIES)
            {
                if let Some(entry_value) = entries.get(&key) {
                    sanitized.insert(
                        key,
                        sanitize_workload_corpus_value(entry_value, remaining_depth - 1),
                    );
                }
            }

            Value::Object(sanitized)
        }
    }
}

/// Looks up `key` in `payload` and returns a sanitized copy of what it finds.
///
/// The found value is sanitized with a depth budget of
/// `WORKLOAD_CORPUS_MAX_DEPTH`. When `payload.get(key)` yields nothing the
/// result is `Value::Null`.
fn workload_corpus_summary_field(payload: &Value, key: &str) -> Value {
    match payload.get(key) {
        Some(field) => sanitize_workload_corpus_value(field, WORKLOAD_CORPUS_MAX_DEPTH),
        None => Value::Null,
    }
}

pub(crate) fn run_workload_corpus_export_command(
    config: &FriggConfig,
    output_path: &Path,
    format: WorkloadCorpusExportFormat,
    limit: usize,
) -> Result<(), Box<dyn Error>> {
    if limit == 0 {
        return Err(Box::new(io::Error::other(
            "export-workload-corpus limit must be greater than zero",
        )));
    }

    let repositories = config.repositories();
    let mut rows = Vec::new();

    for repo in &repositories {
        let root = config.root_by_repository_id(&repo.repository_id.0).ok_or_else(|| {
            io::Error::other(format!(
                "export-workload-corpus summary status=failed repository_id={} error=workspace root lookup failed",
                repo.repository_id.0
            ))
        })?;
        let db_path = resolve_storage_db_path(root, "export-workload-corpus")?;
        let storage = Storage::new(&db_path);
        let repo_rows = storage
            .load_recent_provenance_events(limit)
            .map_err(|err| {
                io::Error::other(format!(
                    "export-workload-corpus failed for repository_id={} root={} db={}: {err}",
                    repo.repository_id.0,
                    root.display(),
                    db_path.display()
                ))
            })?;

        let exported_count = repo_rows.len();
        for row in repo_rows {
            let payload = serde_json::from_str::<Value>(&row.payload_json).unwrap_or_else(|_| {
                Value::Object(Map::from_iter([(
                    "payload_decode_error".to_owned(),
                    Value::String(bounded_workload_corpus_text(&row.payload_json)),
                )]))
            });
            let repository_id = payload
                .get("target_repository_id")
                .and_then(|value| value.as_str())
                .unwrap_or(&repo.repository_id.0)
                .to_owned();
            let source_refs_summary = workload_corpus_summary_field(&payload, "source_refs");
            let source_ref_count = payload
                .get("source_refs")
                .and_then(|value| value.as_array())
                .map_or(0, Vec::len);

            rows.push(WorkloadCorpusExportRow {
                trace_id: row.trace_id,
                created_at: row.created_at,
                repository_id,
                tool_name: row.tool_name,
                parameter_summary: workload_corpus_summary_field(&payload, "params"),
                outcome_summary: workload_corpus_summary_field(&payload, "outcome"),
                source_refs_summary,
                source_ref_count,
                normalized_workload: payload
                    .get("normalized_workload")
                    .map(|value| sanitize_workload_corpus_value(value, WORKLOAD_CORPUS_MAX_DEPTH)),
            });
        }

        println!(
            "export-workload-corpus ok repository_id={} root={} db={} rows={}",
            repo.repository_id.0,
            root.display(),
            db_path.display(),
            exported_count
        );
    }

    rows.sort_by(|left, right| {
        left.repository_id
            .cmp(&right.repository_id)
            .then(left.created_at.cmp(&right.created_at))
            .then(left.trace_id.cmp(&right.trace_id))
            .then(left.tool_name.cmp(&right.tool_name))
    });

    let parent = output_path.parent().ok_or_else(|| {
        io::Error::other(format!(
            "export-workload-corpus output path has no parent: {}",
            output_path.display()
        ))
    })?;
    std::fs::create_dir_all(parent)?;

    match format {
        WorkloadCorpusExportFormat::Json => {
            std::fs::write(output_path, to_string_pretty(&rows)?)?;
        }
        WorkloadCorpusExportFormat::Jsonl => {
            let mut encoded = String::new();
            for row in &rows {
                encoded.push_str(&serde_json::to_string(row)?);
                encoded.push('\n');
            }
            std::fs::write(output_path, encoded)?;
        }
    }

    println!(
        "export-workload-corpus summary status=ok repositories={} rows={} format={} output={} limit={}",
        repositories.len(),
        rows.len(),
        format.as_str(),
        output_path.display(),
        limit
    );
    Ok(())
}