keyclaw 0.2.1

Local MITM proxy that keeps secrets out of LLM traffic
Documentation
//! Second-pass secret scanning via the external `kingfisher` binary.

use std::fs;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::sync::Arc;

use serde::Deserialize;

use crate::errors::KeyclawError;
use crate::gitleaks_rules::{MatchConfidence, MatchSource};
use crate::placeholder::{self, Replacement};

const KINGFISHER_SUCCESS_WITH_FINDINGS: i32 = 200;
const KINGFISHER_SUCCESS_WITH_VALIDATED_FINDINGS: i32 = 205;

pub fn default_scanner() -> Option<Arc<SecondPassScanner>> {
    if !command_available("kingfisher") {
        return None;
    }

    Some(Arc::new(SecondPassScanner::new("kingfisher")))
}

pub struct SecondPassScanner {
    binary: PathBuf,
}

impl SecondPassScanner {
    pub fn new(binary: impl Into<PathBuf>) -> Self {
        Self {
            binary: binary.into(),
        }
    }

    pub fn from_binary(binary: impl Into<PathBuf>) -> Result<Self, KeyclawError> {
        let binary = binary.into();
        if (binary.is_absolute() || binary.components().count() > 1)
            && fs::metadata(&binary).is_ok()
        {
            return Ok(Self { binary });
        }

        if command_available(&binary) {
            Ok(Self { binary })
        } else {
            Err(KeyclawError::uncoded(format!(
                "kingfisher binary not found at {}",
                binary.display()
            )))
        }
    }

    pub fn replace_secrets<F>(
        &self,
        input: &str,
        decoded_depth: u8,
        mut on_secret: F,
    ) -> Result<(String, Vec<Replacement>), KeyclawError>
    where
        F: FnMut(&str) -> Result<String, KeyclawError>,
    {
        let findings = self.scan(input)?;
        if findings.is_empty() {
            return Ok((input.to_string(), Vec::new()));
        }

        let mut out = String::with_capacity(input.len());
        let mut replacements = Vec::with_capacity(findings.len());
        let mut last = 0usize;

        for finding in findings {
            if finding.start < last
                || finding.end > input.len()
                || !input.is_char_boundary(finding.start)
                || !input.is_char_boundary(finding.end)
            {
                continue;
            }

            let secret = &input[finding.start..finding.end];
            if placeholder::contains_placeholder_prefix(secret) {
                continue;
            }

            let id = on_secret(secret)?;
            let rendered = placeholder::make(&id);

            out.push_str(&input[last..finding.start]);
            out.push_str(&rendered);
            last = finding.end;

            replacements.push(Replacement {
                rule_id: finding.rule_id,
                id,
                placeholder: rendered,
                secret: secret.to_string(),
                source: MatchSource::Regex,
                confidence: finding.confidence,
                confidence_score: finding.confidence_score,
                entropy: finding.entropy,
                decoded_depth,
            });
        }

        if replacements.is_empty() {
            return Ok((input.to_string(), replacements));
        }

        out.push_str(&input[last..]);
        Ok((out, replacements))
    }

    fn scan(&self, input: &str) -> Result<Vec<SecondPassFinding>, KeyclawError> {
        let temp = tempfile::tempdir()
            .map_err(|err| KeyclawError::uncoded_with_source("create kingfisher tempdir", err))?;
        let input_path = temp.path().join("payload.txt");
        let output_path = temp.path().join("findings.json");

        fs::write(&input_path, input)
            .map_err(|err| KeyclawError::uncoded_with_source("write kingfisher payload", err))?;

        let output = Command::new(&self.binary)
            .arg("scan")
            .arg(&input_path)
            .arg("--format")
            .arg("json")
            .arg("--output")
            .arg(&output_path)
            .arg("--no-validate")
            .arg("--no-base64")
            .arg("--no-dedup")
            .arg("--confidence")
            .arg("medium")
            .output()
            .map_err(|err| KeyclawError::uncoded_with_source("run kingfisher", err))?;

        let exit_code = output.status.code();
        if !matches!(
            exit_code,
            Some(0 | KINGFISHER_SUCCESS_WITH_FINDINGS | KINGFISHER_SUCCESS_WITH_VALIDATED_FINDINGS)
        ) {
            let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
            let message = if stderr.is_empty() {
                format!("kingfisher exited with status {:?}", output.status.code())
            } else {
                format!(
                    "kingfisher exited with status {:?}: {}",
                    output.status.code(),
                    stderr
                )
            };
            return Err(KeyclawError::uncoded(message));
        }

        let report_bytes = match fs::read(&output_path) {
            Ok(bytes) => bytes,
            Err(err) if exit_code == Some(0) => {
                if err.kind() == std::io::ErrorKind::NotFound {
                    return Ok(Vec::new());
                }
                return Err(KeyclawError::uncoded_with_source(
                    "read kingfisher output",
                    err,
                ));
            }
            Err(err) => {
                return Err(KeyclawError::uncoded_with_source(
                    "read kingfisher output",
                    err,
                ));
            }
        };

        if report_bytes.is_empty() {
            return Ok(Vec::new());
        }

        let report: CliReportEnvelope = serde_json::from_slice(&report_bytes)
            .map_err(|err| KeyclawError::uncoded_with_source("parse kingfisher output", err))?;
        let mut findings = Vec::with_capacity(report.findings.len());

        for record in report.findings {
            let Some(start) = byte_offset_for_line_column(
                input,
                record.finding.line,
                record.finding.column_start,
            ) else {
                continue;
            };
            let Some(end) =
                byte_offset_for_line_column(input, record.finding.line, record.finding.column_end)
            else {
                continue;
            };
            if start >= end {
                continue;
            }

            findings.push(SecondPassFinding {
                rule_id: record.rule.id,
                start,
                end,
                confidence: parse_confidence(&record.finding.confidence),
                confidence_score: confidence_score(&record.finding.confidence),
                entropy: record.finding.entropy.parse::<f64>().ok(),
            });
        }

        findings.sort_by(|left, right| {
            left.start
                .cmp(&right.start)
                .then_with(|| (right.end - right.start).cmp(&(left.end - left.start)))
                .then_with(|| left.rule_id.cmp(&right.rule_id))
        });

        Ok(findings)
    }
}

pub fn default_binary_available() -> bool {
    command_available("kingfisher")
}

fn command_available(command: impl AsRef<Path>) -> bool {
    Command::new(command.as_ref())
        .arg("--version")
        .output()
        .map(|output| output.status.success())
        .unwrap_or(false)
}

fn parse_confidence(confidence: &str) -> MatchConfidence {
    match confidence.trim().to_ascii_lowercase().as_str() {
        "high" => MatchConfidence::High,
        "medium" => MatchConfidence::Medium,
        _ => MatchConfidence::Low,
    }
}

fn confidence_score(confidence: &str) -> u8 {
    match confidence.trim().to_ascii_lowercase().as_str() {
        "high" => 90,
        "medium" => 70,
        _ => 40,
    }
}

fn byte_offset_for_line_column(input: &str, line: u32, column: u32) -> Option<usize> {
    if line == 0 {
        return None;
    }

    let mut current_line = 1u32;
    let mut line_start = 0usize;

    for segment in input.split_inclusive('\n') {
        if current_line == line {
            let candidate = line_start + column as usize;
            let line_len = segment.strip_suffix('\n').unwrap_or(segment).len();
            return (column as usize <= line_len).then_some(candidate);
        }
        line_start += segment.len();
        current_line += 1;
    }

    if input.ends_with('\n') && current_line == line {
        let candidate = line_start + column as usize;
        let line_len = input.len().saturating_sub(line_start);
        return (column as usize <= line_len).then_some(candidate);
    }

    None
}

#[derive(Deserialize)]
struct CliReportEnvelope {
    #[serde(default)]
    findings: Vec<CliFindingRecord>,
}

#[derive(Deserialize)]
struct CliFindingRecord {
    rule: CliRuleMetadata,
    finding: CliFindingData,
}

#[derive(Deserialize)]
struct CliRuleMetadata {
    id: String,
}

#[derive(Deserialize)]
struct CliFindingData {
    confidence: String,
    entropy: String,
    line: u32,
    column_start: u32,
    column_end: u32,
}

struct SecondPassFinding {
    rule_id: String,
    start: usize,
    end: usize,
    confidence: MatchConfidence,
    confidence_score: u8,
    entropy: Option<f64>,
}

#[cfg(test)]
mod tests {
    use super::byte_offset_for_line_column;

    #[test]
    fn byte_offsets_follow_line_boundaries() {
        let input = "alpha\nbeta\ngamma";

        assert_eq!(byte_offset_for_line_column(input, 1, 5), Some(5));
        assert_eq!(byte_offset_for_line_column(input, 2, 4), Some(10));
        assert_eq!(byte_offset_for_line_column(input, 2, 5), None);
        assert_eq!(byte_offset_for_line_column(input, 4, 0), None);
    }

    #[test]
    fn byte_offsets_allow_trailing_empty_line() {
        let input = "alpha\n";

        assert_eq!(byte_offset_for_line_column(input, 2, 0), Some(6));
        assert_eq!(byte_offset_for_line_column(input, 2, 1), None);
    }

    #[test]
    fn byte_offsets_use_utf8_byte_columns() {
        let input = "first\nésecret";

        assert_eq!(byte_offset_for_line_column(input, 2, 0), Some(6));
        assert_eq!(byte_offset_for_line_column(input, 2, 2), Some(8));
        assert_eq!(&input[8..], "secret");
    }
}